Regression, Fitting, and Minimization

The functions listed here are defined in regression.c, gaussj.c, sparselsqr.c, amoeba.c, and circlefit.c. They are declared in cfsemshare.h, which includes mrcslice.h and is included by b3dutil.h.

Functions for Linear Regression

void statMatrices(float *x, int xsize, int colFast, int m, int msize, int ndata, float *sx, float *ss, float *ssd, float *d, float *r, float *xm, float *sd, int ifdisp)
void statmatrices(float *x, int *xsize, int *colFast, int *m, int *msize, int *ndata, float *sx, float *ss, float *ssd, float *d, float *r, float *xm, float *sd, int *ifdisp)
int multRegress(float *x, int xSize, int colFast, int numInpCol, int numData, int numOutCol, int wgtCol, float *sol, int solSize, float *cons, float *xMean, float *xSD, float *work)
int multregressnoc(float *x, int *xSize, int *colFast, int *numInpCol, int *numData, int *numOutCol, int *wgtCol, float *sol, int *solSize, float *xMean, float *xSD, float *work)
int polynomialFit(float *x, float *y, int ndata, int order, float *slopes, float *intcpt, float *work)
int weightedPolyFit(float *x, float *y, float *weight, int ndata, int order, float *slopes, float *intcpt, float *work)
int robustRegress(float *x, int xSize, int colFast, int numInpCol, int numData, int numOutCol, float *sol, int solSize, float *cons, float *xMean, float *xSD, float *work, float kfactor, int *numIter, int maxIter, int maxZeroWgt, float maxChange, float maxOscill)
int robustregressnoc(float *x, int *xSize, int *colFast, int *numInpCol, int *numData, int *numOutCol, float *sol, int *solSize, float *xMean, float *xSD, float *work, float *kfactor, int *numIter, int *maxIter, int *maxZeroWgt, float *maxChange, float *maxOscillate)

Gauss-Jordan Function for Solving Linear Equations

int gaussj(float *a, int n, int np, float *b, int m, int mp)
int gaussjDet(float *a, int n, int np, float *b, int m, int mp, float *determ)

Sparse Matrix Functions to Use with Lsqr

void sparseProd(int mode, int m, int n, double x[], double y[], void *UsrWrk)
void addValueToRow(float val, int icol, float *valRow, int *icolRow, int *numInRow)
void addvaluetorow(float *val, int *icol, float *valRow, int *icolRow, int *numInRow)
int addRowToMatrix(float *valRow, int *icolRow, int numInRow, float *rwrk, int *ia, int *ja, int *numRows, int maxRows, int maxVals)
int addrowtomatrix(float *valRow, int *icolRow, int *numInRow, float *rwrk, int *ia, int *ja, int *numRows, int *maxRows, int *maxVals)
void normalizeColumns(float *rwrk, int *ia, int *ja, int numVars, int numRows, float *sumEntries)
void normalizecolumns(float *rwrk, int *ia, int *ja, int *numVars, int *numRows, float *sumEntries)

Simplex Search Functions

void amoeba(float *p, float *y, int mp, int ndim, float ftol, void (*funk)(float *, float *), int *iterP, float *ptol, int *iloP)
void amoebaInit(float *p, float *y, int mp, int ndim, float delfac, float ptolFac, float *a, float *da, void (*funk)(float *, float *), float *ptol)
void dualAmoeba(float *y, int ndim, float delfac, float *ptolFacs, float *ftolFacs, float *a, float *da, void (*funk)(float *, float *), int *iterP)

Function for Minimization Search in 1D

int minimize1D(float curPosition, float curValue, float initialStep, int numScanSteps, int *numCutsDone, float *brackets, float *nextPosition)
int minimize1d(float *curPosition, float *curValue, float *initialStep, int *numScanSteps, int *numCutsDone, float *brackets, float *nextPosition)

Functions for Fitting Circles and Spheres

int circleThrough3Pts(float x1, float y1, float x2, float y2, float x3, float y3, float *rad, float *xc, float *yc)
int fitSphere(float *xpt, float *ypt, float *zpt, int numPts, float *rad, float *xcen, float *ycen, float *zcen, float *rmsErr)
int fitSphereWgt(float *xpt, float *ypt, float *zpt, float *weights, int numPts, float *rad, float *xcen, float *ycen, float *zcen, float *rmsErr)
void fitcircle(float *xpt, float *ypt, int *numPts, float *rad, float *xcen, float *ycen, float *rmsErr)
void fitcirclewgt(float *xpt, float *ypt, float *weights, int *numPts, float *rad, float *xcen, float *ycen, float *rmsErr)
void enableRadiusFitting(int doFit)
void enableradiusfitting(int *doFit)
int fitCenteredEllipse(float *xpt, float *ypt, int numPts, float *xrad, float *yrad, float *theta, float *rmsErr, float *work)

Functions for Linear Regression

The functions in this section all have Fortran wrappers with the same names and arguments.

void statMatrices(float x, int xsize, int colFast, int m, int msize, int ndata, float sx, float ss, float ssd, float d, float r, float xm, float sd, int ifdisp)

Computes basic statistical values and matrices from a data matrix representing a series of measurements of multiple variables.
Input parameters:
x       - Input data matrix
xsize   - Size of the fastest-progressing dimension of x
colFast - Nonzero if the column dimension is the fastest progressing one
m       - Number of columns of data, i.e., number of parameters
msize   - Size of one dimension of the various square output matrices
ndata   - Number of rows of data; i.e., number of separate measurements
ifdisp  - Value indicating either to skip computation of d and r (if 0), or to treat the m+1 column of data as weighting values (if < 0)
Outputs: sx      - Array for the sums of the m data columns
ss      - Array for raw sums of squares and cross-products
ssd     - Array for sums of deviation squares and cross-products
d       - Array for variances and covariances  (dispersion matrix)
r       - Array for matrix of correlation coefficients
xm      - Array for the means of the m data columns
sd      - Array for the standard deviations of the m data columns
The output matrices will be treated as having leading dimension msize, so they must be at least msize x m in size.  The data element at row i, column j would be accessed as x[i + j * xsize] or x[j][i] from C, or as x(i,j) from Fortran if colFast is 0, or as x[j + i * xsize], x[i][j], or x(j,i) if colFast is nonzero.  When  weighting is used, the returned values are weighted means, and other statistics with the weights incorporated.

void statmatrices(float x, int xsize, int colFast, int m, int msize, int ndata, float sx, float ss, float ssd, float d, float r, float xm, float sd, int ifdisp)

Fortran wrapper for statMatrices

int multRegress(float x, int xSize, int colFast, int numInpCol, int numData, int numOutCol, int wgtCol, float sol, int solSize, float cons, float xMean, float xSD, float work)

Computes a multiple linear regression (least-squares fit) for the relationships between one or more dependent (output) variables and a set of independent (input) variables.
Input parameters:
x        - Input data matrix
xSize    - Size of the fastest-progressing dimension of x
colFast  - Nonzero if the column dimension is the fastest progressing one, i.e. if successive values in the array occur in successive columns
numInpCol  - Number of columns of data for input variables.  This is allowed to be 0
numData   - Number of rows of data; i.e., number of separate measurements
numOutCol - Number of columns of data for output variables; i.e., number of relationships to fit
wgtCol    - Column number with weighting factors if > 0, otherwise no weighting. Columns are numbered from 0 when calling from C, or 1 calling from Fortran.
solSize   - Size of the fastest-progressing dimension of sol, the array/matrix to receive the solutions; must be at least numInpCol
work      - Any array for temporary use whose size must be at least (numInpCol + numOutCol) * (numInpCol + numOutCol)  floating point elements
Outputs:
sol       - Matrix to receive the numOutCol sets of numInpCol coefficients.  Each set is placed in a column of sol, where the row dimension is the fastest progressing one
cons      - Array to receive the numOutCol constant terms of the fits, or NULL to fit an equation with no constant term
xMean     - Array for the means of the numInpCol plus numOutCol data columns
xSD       - Array for the standard deviations of the numInpCol plus numOutCol data columns (these will be far from correct if cons is NULL)
The input variables should be placed in the first numInpCol columns of x and the the output variables in the next numOutCol columns. The data element at row i, column j would be accessed as x[i + j * xSize] or x[j][i] from C, or as x(i,j) from Fortran if colFast is 0, or as x[j + i * xSize], x[i][j], or x(j,i) if colFast is nonzero.
The return value is 1 if wgtCol has an inappropriate value, or 3 if gaussj returns with an error.

int multregressnoc(float x, int xSize, int colFast, int numInpCol, int numData, int numOutCol, int wgtCol, float sol, int solSize, float xMean, float xSD, float work)

Fortran wrapper to call multRegress to fit with no constant term

int polynomialFit(float x, float y, int ndata, int order, float slopes, float intcpt, float *work)

Uses multiple linear regression to fit a polynomial of order order to ndata points whose (x,y) coordinates are in the arrays x and y. It returns the coefficient of x to the i power in the array slopes and a constant term in intcpt.  The equation fit is:
Y = intcpt + slopes[0] * X + slopes[1] * X**2 + ...
work is an array whose size must be at least (order + 1) * (order + 3 + ndata). The return value is the value returned by multRegress.  Note that a Fortran function polyfit in libhvem takes care of allocating work to the needed size and calling the Fortran wrapper to this function.

int weightedPolyFit(float x, float y, float weight, int ndata, int order, float slopes, float intcpt, float work)

Uses multiple linear regression to fit a polynomial of order order to ndata points whose (x,y) coordinates are in the arrays x and y. It returns the coefficient of x to the i power in the array slopes and a constant term in intcpt.  The equation fit is:
Y = intcpt + slopes[0] * X + slopes[1] * X**2 + ...
work is an array whose size must be at least (order + 2) * ndata + (order + 1) * (order + 3)).
The return value is the value returned by multRegress.  This function is untested.

int robustRegress(float x, int xSize, int colFast, int numInpCol, int numData, int numOutCol, float sol, int solSize, float cons, float xMean, float xSD, float work, float kfactor, int *numIter, int maxIter, int maxZeroWgt, float maxChange, float maxOscill)

Computes a robust least squares fit by iteratively computing a weight from the residual for each data point then doing a weighted regression.  The weight is computed by finding the median and normalized median absolute deviation (MADN) of the residual values.  When there are multiple columns of dependent (output) variables, the square root of the sum of the squares of the residuals for the different variables is used. Each residual is standardized by taking (residual - median) / MADN, and the weight is computed from the Tukey bisquare equation using the standardized residual divided by the specified kfactor.  However, with multiple output variables, all residuals are positive and ones less than the median are given a weighting of 1. Input parameters:
x         - Input data matrix
xSize     - Size of the fastest-progressing dimension of x
colFast   - Nonzero if the column dimension is the fastest progressing one
numInpCol  - Number of columns of data for input variables, which is allowed to be 0
numData   - Number of rows of data; i.e., number of separate measurements
numOutCol - Number of columns of data for output variables; i.e., number of relationships to fit.
solSize   - Size of the fastest-progressing dimension of sol, the array/matrix to receive the solutions; must be at least numInpCol
work      - Any array for temporary use whose size must be at least the maximum of (numInpCol + numOutCol) * (numInpCol + numOutCol) and 2 * numData floating point elements
kfactor   - Factor by which to divide the standardized residual value in computing the Tukey bisquare weight.  4.68 is the typical value; a different value may be more appropriate with multiple dependent variables.  If the factor is negative, the absolute value is used, and a list of data row numbers to be given zero initial weights can be passed in the work array.  The first element of work would have the number or rows to initialize, and the following values would have row indexes, numbered from 0 even when calling from Fortran.
maxIter   - Maximum number of iterations, or the negative of the maximum to get trace output on each iteration.  20-50 is probably adequate.  With a negative value, the program outputs the mean and maximum change in weighting and the number of points with weights of 0, between 0 and 0.1, between 0.1 and 0.2, and less than 0.5.
maxZeroWgt - Maximum number of points allowed to have zero weights.  When this number is exceeded on an iteration, the deviations are sorted and the criterion deviation is permanently changed to midway between the deviations for the last point to be retained and first one to be eliminated.
maxChange  - Maximum change in weights from one iteration to the next or across two iterations; the entry should not be smaller than 0.01.
maxOscill  - Maximum change in weights from one iteration to the next, even if oscillating. The iterations are terminated when the biggest change in weights between iterations is less than maxChange, or when it is less than maxOscill and the biggest change across two iterations is less than maxChange.
Outputs:
sol       - Matrix to receive the numOutCol sets of numInpCol coefficients.  Eac set is placed in a column of sol, where the row dimension is the fastest progressing one
cons      - Array to receive the numOutCol constant terms of the fits
xMean     - Array for the means of the numInpCol plus numOutCol data columns
xSD       - Array for the standard deviations of the numInpCol plus numOutCol data columns
numIter - Number of iterations
The input variables should be placed in the first numInpCol columns of x and the output variables in the next numOutCol columns (see multRegress). Final weights are returned in the column of x after the output variables, and the column after that is used for weights on the previous iteration, so x must have at least numInpCol + numOutCol + 2 columns.  The unweighted and weighted root-mean-square residual errors are returned in the first and second elements of work.
The return value is 1 if there are not enough columns for the weights (detectable only when colFast is nonzero), 2 if there is a problem with the specification of rows to be given initial weights of zero in work, 3 if gaussj returns with an error, or -1 if the weights do not converge.
The procedure is described in Beaton, A.E., and Tukey, J.W., 1974, The fitting of power series, meaning polynomials, illustrated on band-spectroscopic data. Technometrics 16: 147-185, and in Gross, A. M., 1977, Confidence intervals for bisquare regression estimates. Journal of the American Statistical Association 72: 341-354.  A value of 4.685 for kfactor gives a 95% efficiency for normally distributed errors and a single output variable (Holland, P.W., and Welsch, R.E., 1977, Robust regression using iteratively reweighted least-squares. Communications in Statistics - Theory and Methods, 6: 813-827).

int robustregressnoc(float x, int xSize, int colFast, int numInpCol, int numData, int numOutCol, float sol, int solSize, float xMean, float xSD, float work, float kfactor, int numIter, int maxIter, int maxZeroWgt, float maxChange, float *maxOscillate)

Fortran wrapper to call robustRegress to fit with no constant term

Gauss-Jordan Function for Solving Linear Equations

int gaussj(float a, int n, int np, float b, int m, int mp)

Solves the linear matrix equation A X = B by Gauss-Jordan elimination. A is a square matrix of size n by n in array a, dimensioned to np columns. B is a matrix with one row per row of A and m columns in array b, dimensioned to mp columns. The columns of b are replaced by the m solution vectors while a is reduced to a unit matrix. It is called the same from C and Fortran, but the matrices must be in row-major order in both cases. Specifically, in C, the matrices are indexed as A[row][column] and in Fortran they are indexed as A(column,row).
The maximum value of n is 2000 but better and faster approaches should be used long before reaching that point. The routine returns -1 if n exceeds this value and 1 if the A matrix is singular.

int gaussjDet(float a, int n, int np, float b, int m, int mp, float *determ)

Version of gaussj that returns a determinant value

Sparse Matrix Functions to Use with Lsqr

void sparseProd(int mode, int m, int n, double x[], double y[], void *UsrWrk)

Compute products in a sparse matrix for lsqr.  The matrix has m data rows times n columns, one for each variable
If mode = 1, compute  y = y + A*x.
If mode = 2, compute  x = x + A(transpose)*y.
UsrWrk is loaded as follows:
Offset to JA values, the column number of each data value (from 1)
Offset to RW values, the data values themselves
IA values: the starting index of each row.  Numbered from 1

void addValueToRow(float val, int icol, float valRow, int icolRow, int *numInRow)

Adds one value val and its data column icol to arrays valRow and icolRow for the current row, and maintains the number of values in numInRow.

void addvaluetorow(float val, int icol, float valRow, int icolRow, int *numInRow)

Fortran wrapper for addValueToRow

int addRowToMatrix(float valRow, int icolRow, int numInRow, float rwrk, int ia, int ja, int numRows, int maxRows, int maxVals)

Adds one data row to a sparse matrix. Values are in valRow, column numbers (numbered from 1) in icolRow, number of values in numInRow. rwrk is the sparse matrix array of data values, ja is the corresponding array of column numbers, ia is an array of starting indexes into those arrays for each row (indexes numbered from 1). The number of rows is maintained in numRows, and maxRows and maxVals are the maximum number of rows and values allowed.

int addrowtomatrix(float valRow, int icolRow, int numInRow, float rwrk, int ia, int ja, int numRows, int maxRows, int *maxVals)

Fortran wrapper for addRowToMatrix

void normalizeColumns(float rwrk, int ia, int ja, int numVars, int numRows, float sumEntries)

Normalizes each column of data in the sparse matrix by dividing it by the square root of the sum of squares of values in that column, as recommended for running lsqr. rwrk, ja, and ia are as described for addRowToMatrix; numVars is the number of variables (columns) and numRows is the number of rows. The normalizing factor for each column is returned in sumEntries. The solution vector obtained from normalized data needs to be divided by sumEntries.

void normalizecolumns(float rwrk, int ia, int ja, int numVars, int numRows, float sumEntries)

Fortran wrapper for normalizeColumns

Simplex Search Functions

void amoeba(float p, float y, int mp, int ndim, float ftol, void (funk)(float , float ), int iterP, float ptol, int iloP)

Performs a multidimensional search to minimize a function of ndim variables using the Nelder-Mead algorithm as elaborated by Lagarius et al. The function value is computed by funk, a void function of two variables: a float array containing the values of the ndim variables, and a pointer to a float for the function value. p is a 2-dimensional array dimensioned to at least one more than the number of variables in each dimension, and mp specifies its fastest dimension (second in C, first in fortran). y is a one-dimensional array also dimensioned to at least one more than the number of variables. Both of these should be preloaded by calling amoebaInit. Termination is controlled by ftol, which is a limit for the fractional difference in function value between the lowest and highest point in the simplex, and the array ptol, which has limits for the difference of each variable; each component of each point must be within the respective limit of the value for the lowest point. iterP is returned with the number of iterations; iloP is returned with the index of the minimum vector in p (p[i][iloP] in C, p(iloP, i) in fortran).
From fortran the subroutine is called as:
call amoeba(p, y, mp, ndim, ftol, funk, iter, ptol, ilo)
where funk is:
subroutine funk(a, value)

void amoebaInit(float p, float y, int mp, int ndim, float delfac, float ptolFac, float a, float da, void (funk)(float , float ), float ptol)

Initializes the arrays p, y, and ptol before calling amoeba. a is an array with the initial values of the variable. da is an array with factors proportional to the magnitude of the components of a. The initial step size for each variable is set to delfac times da, while the termination tolerance for each variable is set to ptolFac times da. Other variables are as just described. From fortran it is called as:
call amoebaInit(p, y, mp, ndim, delfac, ptolFac, a, da, func, ptol)

void dualAmoeba(float y, int ndim, float delfac, float ptolFacs, float ftolFacs, float a, float da, void (funk)(float , float ), int *iterP)

Runs amoebaInit then amoeba twice, in the recommended fashion, starting the second run from the ending point of the first. The two ptolFac values for the first and second runs should be in the array ptolFacs, and the two ftol factors for the runs should be in the array ftolFacs. The sum of iterations on the two runs is returned in iterP. The best fitting variable vector is return in a. Other variables are as described above. This routine can be used as long as ndim is not bigger than MAX_DUAL_AMOEBA_VAR.

Function for Minimization Search in 1D

int minimize1D(float curPosition, float curValue, float initialStep, int numScanSteps, int numCutsDone, float brackets, float *nextPosition)

Guides the search for a minimum value of a one-dimensional function by initially either scanning for a minimum at a given step size for a number of steps, or walking to the local minimum from the starting point, then repeatedly cutting the step size by a factor of 2 once the minimum is bracketed to an interval.
curPosition  - Currently evaluated position
curValue     - Value at the current position
initialStep  - Initial step size for the search (should be positive)
numScanSteps - Number of steps to scan from initial position, if scanning for the global minimum with a range, or 0 to walk to the local minimum
numCutsDone  - Number of times that the step size has been cut by a factor of 2
brackets     - An array of at least 14 elements for storing positions and values
nextPosition - The next position that needs to be evaluated
To use the routine, initialize numCutsDone to -1 and curPosition at a starting value.  For doing a scan, the starting value would be the low end of the range to be scanned; otherwise it should be the most plausible value from which a minimum can be reached.  Then enter a loop to:
Evaluate the function at curPosition to obtain curValue,
Call this routine,
Test whether the return value is 2 if scanning, or test whether nextPosition is out of range if not scanning, and break the loop with failure to find a minimum,
Test whether numCuts has reached a desired limit, and break the loop with completion,
Assign nextPosition to curPosition or pass curPosition for that argument.
The position that gives the minimum is maintained in brackets[1] and the minimum value is in brackets[8].  The return value is 1 if brackets[13] is not -1, 1, or 2 as set on a previous call, or if the current position is out of range for a scan. The return value is 2 if an initial scan fails to find an absolute minimum.

int minimize1d(float curPosition, float curValue, float initialStep, int numScanSteps, int numCutsDone, float brackets, float *nextPosition)

Fortran wrapper for minimize1D.

Functions for Fitting Circles and Spheres

int circleThrough3Pts(float x1, float y1, float x2, float y2, float x3, float y3, float rad, float xc, float *yc)

Computes the radius rad and center (xc, yc) for a circle through the 3 given points (x1, y1), (x2, y2), and (x3, y3). Returns 1 for error (square root of negative number).

int fitSphere(float xpt, float ypt, float zpt, int numPts, float rad, float xcen, float ycen, float zcen, float rmsErr)

Fit a circle or sphere to a set of points using a simplex search with the amoeba routine. By default, it finds the radius as well as the center, unless enableRadiusFitting has been called with a 0. The routine is thread-safe but cannot be used with more than 64 threads.
Inputs: X and Y coordinates in arrays xpt, ypt; zpt is NULL for a circle fit or has the array of Z coordinate; number of points in numPts.
Inputs/Outputs: radius in rad, center coordinate in xcen, ycen, and zcen for a sphere fit; these must be initialized with reasonable starting values for a search.
Output: RMS error in rmsErr. Returns 0.

int fitSphereWgt(float xpt, float ypt, float zpt, float weights, int numPts, float rad, float xcen, float ycen, float zcen, float *rmsErr)

Fit a circle or sphere to a set of points with weighting of errors using a simplex search with the amoeba routine. By default, it finds the radius as well as the center, unless enableRadiusFitting has been called with a 0. The routine is thread-safe but cannot be used with more than 64 threads.
Inputs: X and Y coordinates in arrays xpt, ypt; zpt is NULL for a circle fit or has the array of Z coordinate; weights has an array of weights or is NULL for no weighting; the number of points is in numPts.
Inputs/Outputs: radius in rad, center coordinate in xcen, ycen, and zcen for a sphere fit; these must be initialized with reasonable starting values for a search.
Output: RMS error in rmsErr. Returns 0.

void fitcircle(float xpt, float ypt, int numPts, float rad, float xcen, float ycen, float *rmsErr)

Fortran wrapper to fitSphere for fitting a circle

void fitcirclewgt(float xpt, float ypt, float weights, int numPts, float rad, float xcen, float ycen, float rmsErr)

Fortran wrapper to fitSphereWgt for fitting a circle with weights

void enableRadiusFitting(int doFit)

Makes the fitting routines find a center position only using the radius provided when doFit is nonzero. Call with 1 to disable finding the radius and 0 to re-enable it.

void enableradiusfitting(int *doFit)

Fortran wrapper to enableRadiusFitting

int fitCenteredEllipse(float xpt, float ypt, int numPts, float xrad, float yrad, float theta, float rmsErr, float *work)

Fits an ellipse centered at the origin to the numPts points in xpt, ypt and returns the rotation counterclockwise in degrees in theta, the semi-axes along the X and Y axes before rotation in xrad and yrad, and the RMS error in rmsErr. The array work must be dimensioned to at least 3 times numPts. The error computed is the minimum distance from each point to the ellipse, and the parameters are searched in a simplex fit to minimize the sum of squares. The distance for each point is returned in work, followed by their angles in radians then by their radial distances from the origin. Returns 1 if the simplex fit failed to change from the initial rough estimate of the parameters.