/*
 * framealign.cpp - module for aligning movie frames passed in sequentially
 *
 * Author: David Mastronarde  email: mast@colorado.edu
 *
 * Copyright (C) 2016 by the Regents of the University of
 * Colorado.  See dist/COPYRIGHT for full copyright notice.
 *
 * No ID line: it is shared between 3 different projects
 */

/* Standard headers assumed here for memset/memcpy, sqrt/pow, and malloc/free */
#include <string.h>
#include <math.h>
#include <stdlib.h>

#if defined(_WIN32) && defined(DELAY_LOAD_FGPU)
/* Headers assumed here for LoadLibrary/GetProcAddress and _stat */
#include <Windows.h>
#include <sys/stat.h>
#ifndef _delayimp_h
extern "C" IMAGE_DOS_HEADER __ImageBase;
#endif
#endif

#include "b3dutil.h"
#include "cppdefs.h"
#include "cfft.h"
#include "framealign.h"
#include "frameutil.h"

#define START_TIMER if (mReportTimes) mWallStart = wallTime();
#define ADD_TIME(a) if (mReportTimes) a += wallTime() - mWallStart;

#if defined(_WIN32) && defined(DELAY_LOAD_FGPU)
#define GET_PROC(t, s, n) s = (t)GetProcAddress(sGpuModule, #n); if (!s) err++;
#define GPU_DLL_NAME "FrameGPU.dll"

typedef void (*FgpuNoArgType)(void);
typedef int (*FgpuTwoIntType)(int, int);
typedef int (*ReturnIntType)(void);
typedef void (*ReturnTwoIntType)(int *, int *);
typedef void (*SetGroupSizeType)(int);
typedef int (*GpuAvailableType)(int, float *, int);
typedef void (*SetUnpaddedSizeType)(int, int, int, int);
typedef int (*SetPreProcParamsType)(float *, int, int, float, unsigned char *, int,
                                    int);
typedef void (*SetBinPadParamsType)(int, int, int, int, int, int, int, int, int, int);
typedef int (*SetupSummingType)(int, int, int, int, int);
typedef int (*SetupAligningType)(int, int, int, int, float *, int, int, int, int);
typedef int (*SetupDoseWeightingType)(float *, int, float);
typedef int (*AddToFullSumType)(float *, float, float);
typedef int (*ReturnSumsType)(float *, float *, float *, int);
typedef int (*NewFilterMaskType)(float *);
typedef int (*ShiftAddToAlignSumType)(int, float, float, int);
typedef int (*CrossCorrelateType)(int, int, float *, int, int);
typedef int (*ProcessAlignImageType)(float *, int, int, int);
typedef int (*ReturnAlignFFTsType)(float **, float **, float *, float *);
typedef int (*ReturnStackedFrameType)(float *, int *);
typedef void (*SetPrintFuncType)(CharArgType func);

static HMODULE sGpuModule;
static GpuAvailableType sFgpuGpuAvailable;
static SetUnpaddedSizeType sFgpuSetUnpaddedSize;
static SetPreProcParamsType sFgpuSetPreProcParams;
static SetBinPadParamsType sFgpuSetBinPadParams;
static SetupSummingType sFgpuSetupSumming;
static SetupAligningType sFgpuSetupAligning;
static SetupDoseWeightingType sFgpuSetupDoseWeighting;
static AddToFullSumType sFgpuAddToFullSum;
static ReturnSumsType sFgpuReturnSums;
static NewFilterMaskType sFgpuReturnUnweightedSum;
static FgpuNoArgType sFgpuCleanup;
static FgpuNoArgType sFgpuRollAlignStack;
static FgpuNoArgType sFgpuRollGroupStack;
static FgpuTwoIntType sFgpuSubtractAndFilterAlignSum;
static NewFilterMaskType sFgpuNewFilterMask;
static ShiftAddToAlignSumType sFgpuShiftAddToAlignSum;
static CrossCorrelateType sFgpuCrossCorrelate;
static ProcessAlignImageType sFgpuProcessAlignImage;
static ReturnTwoIntType sFgpuNumberOfAlignFFTs;
static ReturnAlignFFTsType sFgpuReturnAlignFFTs;
static ReturnStackedFrameType sFgpuReturnStackedFrame;
static FgpuNoArgType sFgpuCleanSumItems;
static FgpuNoArgType sFgpuCleanAlignItems;
static FgpuNoArgType sFgpuZeroTimers;
static FgpuNoArgType sFgpuPrintTimers;
static ReturnIntType sFgpuClearAlignSum;
static FgpuTwoIntType sFgpuSumIntoGroup;
static SetGroupSizeType sFgpuSetGroupSize;
static ReturnIntType sFgpuGetVersion;
static SetPrintFuncType sFgpuSetPrintFunc;

#else

#define sFgpuGpuAvailable fgpuGpuAvailable
#define sFgpuSetUnpaddedSize fgpuSetUnpaddedSize
#define sFgpuSetPreProcParams fgpuSetPreProcParams
#define sFgpuSetBinPadParams fgpuSetBinPadParams
#define sFgpuSetupSumming fgpuSetupSumming
#define sFgpuSetupAligning fgpuSetupAligning
#define sFgpuSetupDoseWeighting fgpuSetupDoseWeighting
#define sFgpuAddToFullSum fgpuAddToFullSum
#define sFgpuReturnSums fgpuReturnSums
#define sFgpuReturnUnweightedSum fgpuReturnUnweightedSum
#define sFgpuCleanup fgpuCleanup
#define sFgpuRollAlignStack fgpuRollAlignStack
#define sFgpuRollGroupStack fgpuRollGroupStack
#define sFgpuSubtractAndFilterAlignSum fgpuSubtractAndFilterAlignSum
#define sFgpuNewFilterMask fgpuNewFilterMask
#define sFgpuShiftAddToAlignSum fgpuShiftAddToAlignSum
#define sFgpuCrossCorrelate fgpuCrossCorrelate
#define sFgpuProcessAlignImage fgpuProcessAlignImage
#define sFgpuNumberOfAlignFFTs fgpuNumberOfAlignFFTs
#define sFgpuReturnAlignFFTs fgpuReturnAlignFFTs
#define sFgpuReturnStackedFrame fgpuReturnStackedFrame
#define sFgpuCleanSumItems fgpuCleanSumItems
#define sFgpuCleanAlignItems fgpuCleanAlignItems
#define sFgpuZeroTimers fgpuZeroTimers
#define sFgpuPrintTimers fgpuPrintTimers
#define sFgpuClearAlignSum fgpuClearAlignSum
#define sFgpuSumIntoGroup fgpuSumIntoGroup
#define sFgpuSetGroupSize fgpuSetGroupSize
#define sFgpuSetPrintFunc fgpuSetPrintFunc
#endif

/*
 * Constructor: initializes all the pointers and calls the cleanup routine for other
 * variables
 */
FrameAlign::FrameAlign()
{
  int i;
  mFullEvenSum = NULL;
  mFullOddSum = NULL;
  mUnweightSum = NULL;
  mAlignSum = NULL;
  mWorkFullSize = NULL;
  mWorkBinPad = NULL;
  mCorrBinPad = NULL;
  mCorrFiltTemp = NULL;
  mShiftTemp = NULL;
  mLinePtrs = NULL;
  mReduceTemp = NULL;
  mFitMat = NULL;
  mFitWork = NULL;
  mFullFiltMask = NULL;
  mTempSubFilt = NULL;
  mWrapTemp = NULL;
  mDumpInd = 0;
  for (i = 0; i < MAX_FILTERS; i++)
    mSubFiltMask[i] = NULL;
  mPickDiffCrit = 5;
  mPickRatioCrit = 3.;
  mFailedOftenCrit = 3;
  mGpuFlags = 0;
  mGroupSizeInitial = 1;
  mGpuLibLoaded = -1;
  mPrintFunc = NULL;
  cleanup();
}

// 1/3/19: tried to free library in cleanup so that it could be replaced with DM
// running, but DM holds on to all DLLs in the Plugins directory
FrameAlign::~FrameAlign(void)
{
#if defined(_WIN32) && defined(DELAY_LOAD_FGPU)
  if (mGpuLibLoaded > 0)
    FreeLibrary(sGpuModule);
#endif
}

void FrameAlign::setPrintFunc(CharArgType func)
{
  mPrintFunc = func;
  utilSetPrintFunc(func);
  if (mGpuLibLoaded > 0)
    sFgpuSetPrintFunc(func);
}
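/*
 * Illustrative use of setPrintFunc (not part of this module): routing output to a
 * custom printer.  The exact CharArgType signature is an assumption here, inferred
 * from its use with utilPrint/utilSetPrintFunc.
 */
#if 0
static void myPrintFunc(char *strMessage)
{
  fprintf(stderr, "framealign: %s", strMessage);
}
//   frameAli.setPrintFunc(myPrintFunc);   // after constructing FrameAlign frameAli
#endif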
/*
 * Initialize before sending a set of frames
 */
int FrameAlign::initialize(int binSum, int binAlign, float trimFrac, int numAllVsAll,
                           int cumAlignAtEnd, int useHybrid, int deferSum,
                           int groupSize, int nx, int ny, float padFrac,
                           float taperFrac, int antiFiltType, float radius1,
                           float *radius2, float sigma1, float *sigma2, int numFilters,
                           int maxShift, float kFactor, float maxMaxWeight,
                           int summingMode, int expectedZ, int makeUnwgtSum,
                           int gpuFlags, int debug)
{
  int alignXpad, alignYpad, sumXpad, sumYpad, fullXpad, fullYpad, ind, filt;
  int divisor, niceLimit, nxPad, nyPad, nxTrim, nyTrim, nxUse, nyUse;
  int expectStack, gpuStackLimit, aliFiltSize, minFiltSize = 128;
  int xReduceTemp = 0, yReduceTemp = 0;
  bool doBinPad = trimFrac != 0. || taperFrac != 0.;
  // Default calls from alignframes/SEM have taperFrac (for align) 0.1,
  // fullTaperFrac -> padFrac for full image at 0.02, and trimFrac of 0.1
  // Make just a full filter mask with high-frequency included if only one filter,
  // no GPU, and not refining at end
  bool justFullFilt = numFilters == 1 && !cumAlignAtEnd &&
    (gpuFlags & GPU_FOR_ALIGNING) == 0;
  int niceGpuLimit = 5;
  bool noisePadOnGpu, binPadOnGpu, stackOnGpu;

  mDebug = debug % 10;
  mReportTimes = (debug / 10) % 10 != 0;
  mDumpCorrs = (debug / 100) % 10 != 0;
  mDumpRefCorrs = (debug / 1000) % 10 != 0;
  mDumpEvenOdd = (debug / 10000) % 10 != 0;
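
  /*
   * The decimal digits of the debug value decoded above act as independent flags:
   * e.g., debug = 110 gives mDebug = 0, mReportTimes = true, and mDumpCorrs = true,
   * while debug = 10001 gives mDebug = 1 and mDumpEvenOdd = true.
   */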
  if (numAllVsAll < 2 + groupSize)
    numAllVsAll = 0;
  if (numAllVsAll > MAX_ALL_VS_ALL || numFilters > MAX_FILTERS ||
      (!numAllVsAll && numFilters > 1) || (gpuFlags && !doBinPad))
    return 1;
  for (filt = 0; filt < numFilters; filt++) {
    CLEAR_RESIZE(mXallShifts[filt], float, numAllVsAll * numAllVsAll);
    CLEAR_RESIZE(mYallShifts[filt], float, numAllVsAll * numAllVsAll);
    mRadius2[filt] = radius2[filt];
    mSigma2[filt] = sigma2[filt];
  }
  for (filt = 0; filt <= numFilters; filt++) {
    CLEAR_RESIZE(mXshifts[filt], float, 0);
    CLEAR_RESIZE(mYshifts[filt], float, 0);
    mResMeanSum[filt] = mResSDsum[filt] = mMaxResMax[filt] = 0.;
    mResMaxSum[filt] = mMaxRawMax[filt] = mRawMaxSum[filt] = 0.;
    mCumulXdiff[filt] = mCumulYdiff[filt] = 0.;
    mPredMeanSum[filt] = 0.;
  }
  divisor = binSum;
  if (!doBinPad)
    divisor = leastCommonMultiple(binSum, binAlign);
  divisor *= 2;
  niceLimit = niceFFTlimit();
  if (gpuFlags)
    niceLimit = niceGpuLimit;

  // Get size of full as a multiple of the necessary divisor
  nxPad = (int)B3DMAX(32, padFrac * nx);
  nyPad = (int)B3DMAX(32, padFrac * ny);
  fullXpad = divisor * ((nx + 2 * nxPad + divisor - 1) / divisor);
  fullYpad = divisor * ((ny + 2 * nyPad + divisor - 1) / divisor);
  fullXpad = niceFrame(fullXpad, divisor, niceLimit);
  fullYpad = niceFrame(fullYpad, divisor, niceLimit);
  mAntiFiltType = antiFiltType;
  if (doBinPad && binAlign > 1 && selectZoomFilter(antiFiltType, 1. / binAlign, &ind))
    return 3;

  // Manage things when there is change in number of all-vs-all
  if (numAllVsAll != mNumAllVsAll || summingMode != mSummingMode ||
      cumAlignAtEnd != mCumAlignAtEnd || useHybrid != mUseHybrid ||
      groupSize != mGroupSizeInitial) {
    for (ind = 0; ind < (int)mSavedBinPad.size(); ind++)
      B3DFREE(mSavedBinPad[ind]);
    for (ind = 0; ind < (int)mSavedFullSize.size(); ind++)
      B3DFREE(mSavedFullSize[ind]);
    for (ind = 0; ind < (int)mSavedGroups.size(); ind++)
      B3DFREE(mSavedGroups[ind]);
    mSavedBinPad.clear();
    mSavedFullSize.clear();
    mSavedFullFrameNum.clear();
    mSavedGroups.clear();
    B3DFREE(mFitMat);
    B3DFREE(mFitWork);
    if (numAllVsAll) {
      mFitMat = B3DMALLOC(float, ((numAllVsAll + 3) * (numAllVsAll - 1) *
                                  numAllVsAll) / 2);
      mFitWork = B3DMALLOC(float, (numAllVsAll + 2) * (numAllVsAll + 2));
      if (testAndCleanup(!mFitMat || !mFitWork))
        return 2;
    }
  }

  if (fullXpad != mFullXpad || fullYpad != mFullYpad || numAllVsAll != mNumAllVsAll ||
      summingMode != mSummingMode) {
    B3DFREE(mShiftTemp);
    B3DFREE(mWorkFullSize);
    mWorkFullSize = B3DMALLOC(float, (fullXpad + 2) * fullYpad);
    mShiftTemp = B3DMALLOC(float, 2 * B3DMAX(nx, ny) + (fullXpad - nx) +
                           (fullYpad - ny));
    mLinePtrs = B3DMALLOC(unsigned char *, fullYpad);
    if (testAndCleanup(!mWorkFullSize || !mShiftTemp || !mLinePtrs))
      return 2;
    mFullXpad = fullXpad;
    mFullYpad = fullYpad;
  }

  // And size of final sum(s)
  sumXpad = fullXpad / binSum;
  sumYpad = fullYpad / binSum;
  if (sumXpad != mSumXpad || sumYpad != mSumYpad || summingMode != mSummingMode) {
    B3DFREE(mFullEvenSum);
    B3DFREE(mFullOddSum);
    if (summingMode <= 0) {
      mFullEvenSum = B3DMALLOC(float, (sumXpad + 2) * sumYpad);
      mFullOddSum = B3DMALLOC(float, (sumXpad + 2) * sumYpad);
      if (testAndCleanup(!mFullEvenSum || !mFullOddSum))
        return 2;
    }
  }
  if (sumXpad != mSumXpad || sumYpad != mSumYpad || makeUnwgtSum != mMakeUnwgtSum) {
    B3DFREE(mUnweightSum);
    if (makeUnwgtSum) {
      mUnweightSum = B3DMALLOC(float, (sumXpad + 2) * sumYpad);
      if (testAndCleanup(!mUnweightSum))
        return 2;
    }
  }
  mSumXpad = sumXpad;
  mSumYpad = sumYpad;

  // And size of align sum
  if (!doBinPad) {
    alignXpad = fullXpad / binAlign;
    alignYpad = fullYpad / binAlign;
  } else {
    nxTrim = (int)(trimFrac * nx);
    nyTrim = (int)(trimFrac * ny);
    nxUse = 2 * binAlign * ((nx - 2 * nxTrim) / (2 * binAlign));
    nyUse = 2 * binAlign * ((ny - 2 * nyTrim) / (2 * binAlign));
    if (mDebug)
      utilPrint("nxTrim = %d nyTrim = %d nxUse = %d nyUse = %d\n", nxTrim, nyTrim,
                nxUse, nyUse);
    mXstart = (nx - nxUse) / 2;
    mXend = mXstart + nxUse - 1;
    mYstart = (ny - nyUse) / 2;
    mYend = mYstart + nyUse - 1;
    nxPad = (int)B3DMAX(16, padFrac * nxUse / binAlign);
    nyPad = (int)B3DMAX(16, padFrac * nyUse / binAlign);
    divisor = 2;
    alignXpad = nxUse / binAlign;
    alignYpad = nyUse / binAlign;
    if (gpuFlags & GPU_FOR_ALIGNING) {
      divisor = NICE_GPU_DIVISOR;
      alignXpad = divisor * ((alignXpad + divisor - 1) / divisor);
      alignYpad = divisor * ((alignYpad + divisor - 1) / divisor);
    }
    alignXpad = niceFrame(alignXpad, divisor, niceLimit);
    alignYpad = niceFrame(alignYpad, divisor, niceLimit);
  }
  if (mDebug)
    utilPrint("fullXpad = %d fullYpad = %d alignXpad = %d alignYpad = %d\n",
              fullXpad, fullYpad, alignXpad, alignYpad);

  // Clean up extra arrays if no trimming now and there was previously
  if ((mTrimFrac || mTaperFrac) && !doBinPad) {
    for (ind = 0; ind < (int)mSavedBinPad.size(); ind++)
      B3DFREE(mSavedBinPad[ind]);
    mSavedBinPad.clear();
    B3DFREE(mCorrBinPad);
  }

  // Allocate the align arrays if needed
  mAlignPix = (alignXpad + 2) * alignYpad;
  mAlignBytes = mAlignPix * sizeof(float);
  if (alignXpad != mAlignXpad || alignYpad != mAlignYpad ||
      numAllVsAll != mNumAllVsAll) {
    B3DFREE(mAlignSum);
    mAlignSum = B3DMALLOC(float, mAlignPix);
    B3DFREE(mWorkBinPad);
    mWorkBinPad = B3DMALLOC(float, mAlignPix);
    B3DFREE(mCorrBinPad);
    mCorrBinPad = B3DMALLOC(float, mAlignPix);
    if (testAndCleanup(!mAlignSum || !mWorkBinPad || !mSubFiltMask || !mCorrBinPad))
      return 2;
    for (ind = 0; ind < (int)mSavedBinPad.size(); ind++)
      B3DFREE(mSavedBinPad[ind]);
    mSavedBinPad.clear();
  }
  aliFiltSize = B3DMAX(minFiltSize, 4 * (maxShift / binAlign));
  ACCUM_MIN(aliFiltSize, alignXpad);
  ACCUM_MIN(aliFiltSize, alignYpad);
  aliFiltSize = niceFrame(aliFiltSize, 2, niceLimit);

  // Manage filter mask arrays
  if (numFilters != mNumFilters || alignXpad != mAlignXpad ||
      alignYpad != mAlignYpad || aliFiltSize != mAliFiltSize) {
    B3DFREE(mFullFiltMask);
    mFullFiltMask = B3DMALLOC(float, mAlignPix);
    if (testAndCleanup(!mFullFiltMask))
      return 2;
    B3DFREE(mCorrFiltTemp);
    B3DFREE(mTempSubFilt);
    B3DFREE(mWrapTemp);
    for (ind = 0; ind <= mNumFilters; ind++)
      B3DFREE(mSubFiltMask[ind]);
    if (!justFullFilt) {
      mCorrFiltTemp = B3DMALLOC(float, mAlignPix);
      mTempSubFilt = B3DMALLOC(float, (aliFiltSize + 2) * aliFiltSize);
      mWrapTemp = B3DMALLOC(float, (aliFiltSize + 2) * aliFiltSize);
      for (ind = 0; ind < numFilters; ind++) {
        mSubFiltMask[ind] = B3DMALLOC(float, (aliFiltSize + 2) * aliFiltSize);
        if (testAndCleanup(!mSubFiltMask[ind] || !mCorrFiltTemp || !mTempSubFilt ||
                           !mWrapTemp))
          return 2;
      }
    }
  }
  mAlignXpad = alignXpad;
  mAlignYpad = alignYpad;

  // And construct the filter masks.  Take square root of the full filter as it just
  // gets applied to each stored FFT before correlation
  XCorrSetCTF(sigma1, sigma2[0] * binAlign, radius1,
              !justFullFilt ? 0.71f : radius2[0] * binAlign, mFiltFunc, mAlignXpad,
              mAlignYpad, &mFiltDelta);
  for (ind = 0; ind < 8193; ind++)
    mFiltFunc[ind] = sqrt(mFiltFunc[ind]);
  for (ind = 0; ind < mAlignPix; ind++)
    mFullFiltMask[ind] = 1.;
  XCorrFilterPart(mFullFiltMask, mFullFiltMask, alignXpad, alignYpad, mFiltFunc,
                  mFiltDelta);
  mFullFiltMask[0] = 0.;
  /*for (ind = 0; ind < alignXpad + 2; ind++)
    mFullFiltMask[ind] = 0.;
  for (ind = 0; ind < alignYpad; ind++)
    mFullFiltMask[ind * (alignXpad + 2)] = 0.;*/
  for (filt = 0; filt < numFilters && !justFullFilt; filt++) {
    XCorrSetCTF(0., sigma2[filt] * binAlign, 0., radius2[filt] * binAlign, mFiltFunc,
                aliFiltSize, aliFiltSize, &mFiltDelta);
    for (ind = 0; ind < (aliFiltSize + 2) * aliFiltSize; ind++)
      mSubFiltMask[filt][ind] = 1.;
    XCorrFilterPart(mSubFiltMask[filt], mSubFiltMask[filt], aliFiltSize, aliFiltSize,
                    mFiltFunc, mFiltDelta);
  }

  // Now manage the reduction temp array
  if (binSum > 1) {
    xReduceTemp = B3DMAX(sumXpad, xReduceTemp);
    yReduceTemp = B3DMAX(sumYpad, yReduceTemp);
  }
  if (!doBinPad) {
    xReduceTemp = B3DMAX(alignXpad, xReduceTemp);
    yReduceTemp = B3DMAX(alignYpad, yReduceTemp);
  }
  if (xReduceTemp != mXreduceTemp || yReduceTemp != mYreduceTemp) {
    B3DFREE(mReduceTemp);
    if (xReduceTemp) {
      mReduceTemp = B3DMALLOC(float, (xReduceTemp + 2) * yReduceTemp);
      if (testAndCleanup(mReduceTemp == NULL))
        return 2;
    }
  }
  mXreduceTemp = xReduceTemp;
  mYreduceTemp = yReduceTemp;
  mDeferSumming = (cumAlignAtEnd || (!useHybrid && numFilters > 1) || deferSum) &&
    summingMode == 0;

  // Do not use GPU for group size above the limit
  // If GPU was used and is not going to be, clean it up
  if (gpuFlags && groupSize > 5 && (gpuFlags & GPU_FOR_ALIGNING) != 0)
    gpuFlags = 0;
  if (mGpuFlags && !gpuFlags)
    sFgpuCleanup();

  // See which GPU components are not being used, and clear the component that is not
  // needed immediately
  mGpuAligning = (gpuFlags & GPU_FOR_ALIGNING) != 0 && summingMode >= 0;
  mGpuSumming = (gpuFlags & GPU_FOR_SUMMING) != 0 && summingMode <= 0;
  noisePadOnGpu = doBinPad && mGpuSumming && (gpuFlags & GPU_DO_NOISE_TAPER);
  binPadOnGpu = doBinPad && mGpuAligning && (gpuFlags & GPU_DO_BIN_PAD);
  stackOnGpu = noisePadOnGpu && binPadOnGpu && (gpuFlags & STACK_FULL_ON_GPU);
  gpuStackLimit = (gpuFlags >> GPU_STACK_LIM_SHIFT) & GPU_STACK_LIM_MASK;
  mNumExpectedFrames = expectedZ;
  expectStack = B3DMIN(numAllVsAll, expectedZ);
  if (cumAlignAtEnd)
    expectStack = expectedZ;
  if (gpuFlags) {
    mFlagsForUnpadCall = (noisePadOnGpu ? GPU_DO_NOISE_TAPER : 0) |
      (binPadOnGpu ? GPU_DO_BIN_PAD : 0) | (stackOnGpu ? STACK_FULL_ON_GPU : 0) |
      ((stackOnGpu && gpuStackLimit > 0) ? GPU_STACK_LIMITED : 0) |
      (gpuFlags & (GPU_AVG_SUPER_2X | GPU_AVG_SUPER_4X));
    sFgpuSetUnpaddedSize(nx, ny, mFlagsForUnpadCall,
                         (mDebug ? 1 : 0) + (mReportTimes ? 10 : 0));
    if (!mGpuSumming || mDeferSumming)
      sFgpuCleanSumItems();
    if (!mGpuAligning)
      sFgpuCleanAlignItems();

    // Set up aligning unconditionally
    if (mGpuAligning && sFgpuSetupAligning(alignXpad, alignYpad,
                                           mGpuSumming ? sumXpad : 0,
                                           mGpuSumming ? sumYpad : 0, mFullFiltMask,
                                           aliFiltSize, groupSize, expectStack,
                                           cumAlignAtEnd))
      gpuFlags = 0;

    // Set up summing unless it is deferred
    mEvenOddForSumSetup = ((gpuFlags & GPU_DO_EVEN_ODD) ? 1 : 0) +
      (((gpuFlags & GPU_DO_UNWGT_SUM) != 0 && makeUnwgtSum) ? 2 : 0);
    if (gpuFlags && mGpuSumming && !mDeferSumming &&
        sFgpuSetupSumming(fullXpad, fullYpad, sumXpad, sumYpad,
                          mEvenOddForSumSetup)) {
      sFgpuCleanup();
      gpuFlags = 0;
    }
  }

  // Save all members for current state
  mGpuFlags = gpuFlags;
  mGpuAligning = (gpuFlags & GPU_FOR_ALIGNING) != 0 && summingMode >= 0;
  mGpuSumming = (gpuFlags & GPU_FOR_SUMMING) != 0 && summingMode <= 0;
  mNoisePadOnGpu = gpuFlags != 0 && noisePadOnGpu;
  mBinPadOnGpu = gpuFlags != 0 && binPadOnGpu;
  mStackUnpadOnGpu = gpuFlags != 0 && stackOnGpu;
  mUnwgtOnGpu = mGpuSumming && (gpuFlags & GPU_DO_UNWGT_SUM) && makeUnwgtSum;

  // Set this to 0 if no stacking, but it also has to be 0 to indicate no limit
  mGpuStackLimit = mStackUnpadOnGpu ? gpuStackLimit : 0;
  mNumStackedOnGpu = 0;
  mNumFullSaved = 0;
  if (gpuFlags)
    sFgpuZeroTimers();
  mGroupSize = groupSize;
  mGroupSizeInitial = groupSize;
  mTrimFrac = trimFrac;
  mTaperFrac = taperFrac;
  mNumFrames = 0;
  mBinSum = binSum;
  mBinAlign = binAlign;
  mNumAllVsAll = numAllVsAll;
  mMaxShift = maxShift;
  mNumFilters = numFilters;
  mSummingMode = summingMode;
  mNumFits = 0;
  mKfactor = kFactor;
  mMaxMaxWeight = maxMaxWeight;
  mBestFilt = 0;
  mPickedBestFilt = false;
  mUseHybrid = useHybrid;
  for (filt = 0; filt < numFilters; filt++)
    mNumAsBestFilt[filt] = 0;
  mMaxShift = maxShift;
  mCumAlignAtEnd = cumAlignAtEnd;
  mAliFiltSize = aliFiltSize;
  mNx = nx;
  mNy = ny;
  memset(mAlignSum, 0, mAlignBytes);
  if (summingMode <= 0) {
    memset(mFullEvenSum, 0, (sumXpad + 2) * sumYpad * sizeof(float));
    memset(mFullOddSum, 0, (sumXpad + 2) * sumYpad * sizeof(float));
  }
  if (mUnweightSum)
    memset(mUnweightSum, 0, (sumXpad + 2) * sumYpad * sizeof(float));
  mWallFullFFT = mWallBinPad = mWallBinFFT = mWallReduce = mWallShift = 0.;
  mWallConjProd = mWallFilter = mWallPreProc = mWallNoise = 0.;
  mDoingDoseWeighting = false;
  CLEAR_RESIZE(mDoseWgtFilter, float, 0);
  CLEAR_RESIZE(mReweightFilt, float, 0);
  mDWFdelta = 0.;
  mMakeUnwgtSum = makeUnwgtSum;
  return 0;
}
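
/*
 * A minimal calling sketch (values illustrative only; radius2/sigma2 must point to
 * arrays with numFilters entries).  Return codes from initialize() above: 1 for an
 * unworkable parameter combination, 2 for an allocation failure, 3 for a zoom-filter
 * selection error.
 */
#if 0
FrameAlign frameAli;
float radius2[1] = {0.06f}, sigma2[1] = {0.0086f};
int err = frameAli.initialize(1, 4, 0.1f, 7, 0, 1, 0, 1, 3838, 3710, 0.02f, 0.1f, 4,
                              0.f, radius2, 0.f, sigma2, 1, 20, 4.5f, 0.1f, 0, 25, 0,
                              0, 0);
#endif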
/*
 * Store parameters and resize arrays for dose weighting, also save a reweighting
 * filter at this time.  Pass NULL for reweightFilt if no reweighting is to be done;
 * pass an array with all 1's to have a normalizing reweighting computed here
 */
int FrameAlign::setupDoseWeighting(float priorDose, float *frameDoses, float pixelSize,
                                   float critScale, float aFac, float bFac, float cFac,
                                   float *reweightFilt, int &filtSize)
{
  int ind, frame;
  bool allOnes = true;
  mDoingDoseWeighting = true;
  mPriorDoseCum = priorDose;
  CLEAR_RESIZE(mFrameDoses, float, mNumExpectedFrames);
  for (ind = 0; ind < mNumExpectedFrames; ind++)
    mFrameDoses[ind] = frameDoses[ind];
  mPixelSize = pixelSize;
  mCritDoseScale = critScale;
  mCritDoseAfac = aFac;
  mCritDoseBfac = bFac;
  mCritDoseCfac = cFac;
  filtSize = 2 * B3DMAX(mFullXpad, mFullYpad);
  B3DCLAMP(filtSize, 1024, 8193);
  CLEAR_RESIZE(mDoseWgtFilter, float, filtSize);
  if (reweightFilt) {
    CLEAR_RESIZE(mReweightFilt, float, filtSize);
    for (ind = 0; ind < filtSize; ind++) {
      mReweightFilt[ind] = reweightFilt[ind];
      if (reweightFilt[ind] != 1.0)
        allOnes = false;
    }

    // If the reweight filter is all ones, compute a normalizing filter here from the
    // inverse of the sum of filters to be used
    if (allOnes) {
      for (ind = 0; ind < filtSize; ind++)
        mReweightFilt[ind] = 0.;
      for (frame = 0; frame < mNumExpectedFrames; frame++) {
        doseWeightFilter(priorDose, priorDose + mFrameDoses[frame], mPixelSize,
                         mCritDoseAfac, mCritDoseBfac, mCritDoseCfac, mCritDoseScale,
                         &mDoseWgtFilter[0], (int)mDoseWgtFilter.size(), 0.71f,
                         &mDWFdelta);
        priorDose += mFrameDoses[frame];
        for (ind = 0; ind < filtSize; ind++)
          mReweightFilt[ind] += mDoseWgtFilter[ind];
      }
      for (ind = 0; ind < filtSize; ind++) {
        if (mReweightFilt[ind] > 0.)
          mReweightFilt[ind] = mNumExpectedFrames / mReweightFilt[ind];
      }
    }
  }
  return 0;
}
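
/*
 * In the all-ones case above, the normalizing filter at each frequency sample k is
 *   R[k] = N / sum_i F_i[k]
 * where F_i is the dose-weighting filter for frame i and N = mNumExpectedFrames, so
 * a sum of the N filtered frames scaled by R has unit response wherever the filter
 * sum is nonzero.
 */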
/* Cleanup on failure of the given test */
int FrameAlign::testAndCleanup(bool failed)
{
  if (!failed)
    return 0;
  cleanup();
  return 2;
}

/*
 * Free all memory and reset sizes etc
 */
void FrameAlign::cleanup()
{
  int ind;
  B3DFREE(mFullOddSum);
  B3DFREE(mFullEvenSum);
  B3DFREE(mUnweightSum);
  B3DFREE(mAlignSum);
  B3DFREE(mWorkBinPad);
  B3DFREE(mWorkFullSize);
  B3DFREE(mCorrBinPad);
  B3DFREE(mCorrFiltTemp);
  B3DFREE(mShiftTemp);
  B3DFREE(mLinePtrs);
  B3DFREE(mReduceTemp);
  B3DFREE(mFullFiltMask);
  B3DFREE(mTempSubFilt);
  B3DFREE(mWrapTemp);
  B3DFREE(mFitMat);
  B3DFREE(mFitWork);
  for (ind = 0; ind < (int)mSavedBinPad.size(); ind++)
    B3DFREE(mSavedBinPad[ind]);
  for (ind = 0; ind < (int)mSavedFullSize.size(); ind++)
    B3DFREE(mSavedFullSize[ind]);
  for (ind = 0; ind < (int)mSavedGroups.size(); ind++)
    B3DFREE(mSavedGroups[ind]);
  CLEAR_RESIZE(mSavedBinPad, float *, 0);
  CLEAR_RESIZE(mSavedFullSize, float *, 0);
  CLEAR_RESIZE(mSavedGroups, float *, 0);
  for (ind = 0; ind < MAX_FILTERS; ind++)
    B3DFREE(mSubFiltMask[ind]);
  mNumFilters = 0;
  mTrimFrac = 0.;
  mTaperFrac = 0.;
  mCumAlignAtEnd = 0;
  mAliFiltSize = 0;
  mUseHybrid = 0;
  mFullXpad = mFullYpad = 0;
  mAlignXpad = mAlignYpad = 0;
  mSumXpad = mSumYpad = 0;
  mXreduceTemp = mYreduceTemp = 0;
  mNumAllVsAll = -1;
  for (ind = 0; ind <= MAX_FILTERS; ind++) {
    CLEAR_RESIZE(mXshifts[ind], float, 0);
    CLEAR_RESIZE(mYshifts[ind], float, 0);
    if (ind < MAX_FILTERS) {
      CLEAR_RESIZE(mXallShifts[ind], float, 0);
      CLEAR_RESIZE(mYallShifts[ind], float, 0);
    }
  }
  if (mGpuFlags)
    sFgpuCleanup();
  CLEAR_RESIZE(mFrameDoses, float, 0);
  CLEAR_RESIZE(mDoseWgtFilter, float, 0);
  mGpuFlags = 0;
}

/*
 * Find out if GPU is available
 */
int FrameAlign::gpuAvailable(int nGPU, float *memory, int debug)
{
  int err = 0;
#if defined(_WIN32) && defined(DELAY_LOAD_FGPU)
  int ind, lastErr = 0, lastSlash;

  // Try loading from current directory, from upper directory (for shrmemframe), and
  // from Shrmemframe directory because there seemed to be a point where GMS 3.42
  // would not load from the current dir, but would from a subdir
  const char *prefixDirs[] = {"", "..\\", "Shrmemframe\\"};
  *memory = 0.;
  if (mGpuLibLoaded == 0)
    return 0;
  if (mGpuLibLoaded < 0) {
    struct _stat statbuf;
    HMODULE thisModule = reinterpret_cast<HMODULE>(&__ImageBase);
    TCHAR dllPath[MAX_PATH];
    std::string absPathStr, hereStr;

    // Get the absolute path of the current plugin and strip off the plugin name
    if (GetModuleFileName(thisModule, &dllPath[0], MAX_PATH)) {
      absPathStr = dllPath;
      lastSlash = (int)absPathStr.rfind('\\');
      if (lastSlash >= 0)
        absPathStr.resize(lastSlash);
      absPathStr += '\\';
    }

    // Loop on the possibilities, first seeing if file exists, then trying to load it
    for (ind = 0; ind < sizeof(prefixDirs) / sizeof(const char *); ind++) {
      hereStr = absPathStr + prefixDirs[ind] + GPU_DLL_NAME;
      if (!_stat(hereStr.c_str(), &statbuf)) {
        sGpuModule = LoadLibrary(hereStr.c_str());
        if (sGpuModule)
          break;
        lastErr = GetLastError();
        utilPrint("Could not load %s (error %d)\n", hereStr.c_str(), lastErr);
        err = 1;
      }
    }

    // Give message on definite failure to load
    if (!sGpuModule) {
      if (err)
#ifdef GMS_MAJOR_VERSION
        utilPrint("GPU is not directly available from SerialEMCCD plugin\n");
#else
        utilPrint("GPU is not available, could not load FrameGPU.dll\n");
#endif
      mGpuLibLoaded = 0;
      return 0;
    }
    GET_PROC(GpuAvailableType, sFgpuGpuAvailable, fgpuGpuAvailable);
    GET_PROC(SetUnpaddedSizeType, sFgpuSetUnpaddedSize, fgpuSetUnpaddedSize);
    GET_PROC(SetPreProcParamsType, sFgpuSetPreProcParams, fgpuSetPreProcParams);
    GET_PROC(SetBinPadParamsType, sFgpuSetBinPadParams, fgpuSetBinPadParams);
    GET_PROC(SetupSummingType, sFgpuSetupSumming, fgpuSetupSumming);
    GET_PROC(SetupAligningType, sFgpuSetupAligning, fgpuSetupAligning);
    GET_PROC(SetupDoseWeightingType, sFgpuSetupDoseWeighting, fgpuSetupDoseWeighting);
    GET_PROC(AddToFullSumType, sFgpuAddToFullSum, fgpuAddToFullSum);
    GET_PROC(ReturnSumsType, sFgpuReturnSums, fgpuReturnSums);
    GET_PROC(NewFilterMaskType, sFgpuReturnUnweightedSum, fgpuReturnUnweightedSum);
    GET_PROC(FgpuNoArgType, sFgpuCleanup, fgpuCleanup);
    GET_PROC(FgpuNoArgType, sFgpuRollAlignStack, fgpuRollAlignStack);
    GET_PROC(FgpuNoArgType, sFgpuRollGroupStack, fgpuRollGroupStack);
    GET_PROC(FgpuTwoIntType, sFgpuSubtractAndFilterAlignSum,
             fgpuSubtractAndFilterAlignSum);
    GET_PROC(NewFilterMaskType, sFgpuNewFilterMask, fgpuNewFilterMask);
    GET_PROC(ShiftAddToAlignSumType, sFgpuShiftAddToAlignSum, fgpuShiftAddToAlignSum);
    GET_PROC(CrossCorrelateType, sFgpuCrossCorrelate, fgpuCrossCorrelate);
    GET_PROC(ProcessAlignImageType, sFgpuProcessAlignImage, fgpuProcessAlignImage);
    GET_PROC(ReturnTwoIntType, sFgpuNumberOfAlignFFTs, fgpuNumberOfAlignFFTs);
    GET_PROC(ReturnAlignFFTsType, sFgpuReturnAlignFFTs, fgpuReturnAlignFFTs);
    GET_PROC(ReturnStackedFrameType, sFgpuReturnStackedFrame, fgpuReturnStackedFrame);
    GET_PROC(FgpuNoArgType, sFgpuCleanSumItems, fgpuCleanSumItems);
    GET_PROC(FgpuNoArgType, sFgpuCleanAlignItems, fgpuCleanAlignItems);
    GET_PROC(FgpuNoArgType, sFgpuZeroTimers, fgpuZeroTimers);
    GET_PROC(FgpuNoArgType, sFgpuPrintTimers, fgpuPrintTimers);
    GET_PROC(ReturnIntType, sFgpuClearAlignSum, fgpuClearAlignSum);
    GET_PROC(FgpuTwoIntType, sFgpuSumIntoGroup, fgpuSumIntoGroup);
    GET_PROC(SetGroupSizeType, sFgpuSetGroupSize, fgpuSetGroupSize);
    GET_PROC(ReturnIntType, sFgpuGetVersion, fgpuGetVersion);
    GET_PROC(SetPrintFuncType, sFgpuSetPrintFunc, fgpuSetPrintFunc);
    if (err)
      utilPrint("GPU is not available: %d functions failed to load from %s\n", err,
                GPU_DLL_NAME);
    if (!err && sFgpuGetVersion() != GPUFRAME_VERSION) {
      utilPrint("GPU is not available: FrameGPU version (%d) does not match "
                "framealign version (%d)\n", sFgpuGetVersion(), GPUFRAME_VERSION);
      err = 1;
    }
    if (err) {
      FreeLibrary(sGpuModule);
      mGpuLibLoaded = 0;
      return 0;
    }
    mGpuLibLoaded = 1;
    if (mPrintFunc)
      sFgpuSetPrintFunc(mPrintFunc);
  }
#endif
  err = sFgpuGpuAvailable(nGPU, memory, debug);
  if (!err)
    utilPrint("GPU is not available%s\n",
              debug ? "" : "; run with debugging output for details");
  return err;
}

/*
 * MACROS for gain normalization and truncation
 */
#define NORM_TRUNC(a, b) \
  case a: \
  for (ix = 0; ix < nxt; ix++) { \
    val = (b) * gainp[ix]; \
    if (val > truncLimit) \
      val = CorDefSurroundingMean(frame, type, nxt, nyt, truncLimit, ix, iy); \
    fOut[base + ix] = val; \
  } \
  break;

#define NORM_ONLY(a, b) \
  case a: \
  for (ix = 0; ix < nxt; ix++) { \
    val = (b) * gainp[ix]; \
    fOut[base + ix] = val; \
  } \
  break;

#define TRUNC_ONLY(a, b) \
  case a: \
  for (ix = 0; ix < nxt; ix++) { \
    val = (b); \
    if (val > truncLimit) \
      val = CorDefSurroundingMean(frame, type, nxt, nyt, truncLimit, ix, iy); \
    fOut[base + ix] = val; \
  } \
  break;

#define JUST_COPY(a, b) \
  case a: \
  for (ix = 0; ix < nxt; ix++) \
    fOut[base + ix] = (b); \
  break;
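
/*
 * For reference, NORM_TRUNC(MRC_MODE_SHORT, sFrame[base + ix]) expands to one case of
 * the switch statements in preProcessFrame below (comments added):
 */
#if 0
case MRC_MODE_SHORT:
  for (ix = 0; ix < nxt; ix++) {
    val = (sFrame[base + ix]) * gainp[ix];   // gain-normalize the pixel
    if (val > truncLimit)                    // and replace outliers by the local mean
      val = CorDefSurroundingMean(frame, type, nxt, nyt, truncLimit, ix, iy);
    fOut[base + ix] = val;
  }
  break;
#endif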
/*
 * Function to preprocess an image with gain normalization, truncation and defect
 * correction
 */
void FrameAlign::preProcessFrame(void *frame, void *darkRef, int defBin, float *fOut)
{
  // OpenMP does not allow member variables to be shared
  int nxGain = mNxGain;
  float truncLimit = mTruncLimit;
  int type = mFrameType;
  int base = 0;
  float *gainRef = mGainRef;
  int numThreads, maxThreads, gainXoff, gainYoff, ix = 0, iy, top, left, bottom, right;
  float val = 0.;
  float *gainp = NULL;
  unsigned char *bFrame = (unsigned char *)frame;
  short *sFrame = (short *)frame;
  unsigned short *usFrame = (unsigned short *)frame;
  float *fFrame = (float *)frame;
  short *sDark = (short *)darkRef;
  unsigned short *usDark = (unsigned short *)darkRef;
  if (mGainRef) {
    gainXoff = (mNxGain - mNx) / 2;
    gainYoff = (mNyGain - mNy) / 2;
  }

  // All processing converts input to a float array: gain norm/truncation place it in
  // float, otherwise it gets copied to float; then defect correction operates
  // float->float
  maxThreads = 1;
  if (truncLimit > 0 && gainRef)
    maxThreads = 6;
  else if (truncLimit > 0 || gainRef)
    maxThreads = 3;
  numThreads = numOMPthreads(maxThreads);
  int nxt = mNx;
  int nyt = mNy;
#pragma omp parallel for num_threads(numThreads) \
  shared(nxt, nyt, gainRef, nxGain, gainXoff, gainYoff, bFrame, sFrame, usFrame, \
         fFrame, fOut, sDark, usDark, type, frame, truncLimit) \
  private(base, iy, ix, gainp, val)
  for (iy = 0; iy < nyt; iy++) {
    base = iy * nxt;
    if (gainRef) {
      gainp = &gainRef[(iy + gainYoff) * nxGain + gainXoff];
      if (darkRef && truncLimit > 0) {

        // Dark and gain with truncation
        switch (type) {
          NORM_TRUNC(MRC_MODE_BYTE, bFrame[base + ix] - sDark[base + ix]);
          NORM_TRUNC(MRC_MODE_SHORT, sFrame[base + ix] - sDark[base + ix]);
          NORM_TRUNC(MRC_MODE_USHORT, usFrame[base + ix] - usDark[base + ix]);
          NORM_TRUNC(MRC_MODE_FLOAT, fFrame[base + ix] - sDark[base + ix]);
        }
      } else if (truncLimit > 0) {

        // Gain norm with truncation
        switch (type) {
          NORM_TRUNC(MRC_MODE_BYTE, bFrame[base + ix]);
          NORM_TRUNC(MRC_MODE_SHORT, sFrame[base + ix]);
          NORM_TRUNC(MRC_MODE_USHORT, usFrame[base + ix]);
          NORM_TRUNC(MRC_MODE_FLOAT, fFrame[base + ix]);
        }
      } else if (darkRef) {

        // Dark and gain without truncation
        switch (type) {
          NORM_ONLY(MRC_MODE_BYTE, bFrame[base + ix] - sDark[base + ix]);
          NORM_ONLY(MRC_MODE_SHORT, sFrame[base + ix] - sDark[base + ix]);
          NORM_ONLY(MRC_MODE_USHORT, usFrame[base + ix] - usDark[base + ix]);
          NORM_ONLY(MRC_MODE_FLOAT, fFrame[base + ix] - sDark[base + ix]);
        }
      } else {

        // Gain norm without truncation
        switch (type) {
        case MRC_MODE_BYTE:
          for (ix = 0; ix < nxt; ix++) {
            val = bFrame[base + ix] * gainp[ix];
            fOut[base + ix] = val;
          }
          break;
          // NORM_ONLY(MRC_MODE_BYTE, bFrame[base + ix]);
          NORM_ONLY(MRC_MODE_SHORT, sFrame[base + ix]);
          NORM_ONLY(MRC_MODE_USHORT, usFrame[base + ix]);
          NORM_ONLY(MRC_MODE_FLOAT, fFrame[base + ix]);
        }
      }
    } else if (truncLimit > 0) {

      // Truncation only
      switch (type) {
        TRUNC_ONLY(MRC_MODE_BYTE, bFrame[base + ix]);
        TRUNC_ONLY(MRC_MODE_SHORT, sFrame[base + ix]);
        TRUNC_ONLY(MRC_MODE_USHORT, usFrame[base + ix]);
        TRUNC_ONLY(MRC_MODE_FLOAT, fFrame[base + ix]);
      }
    } else {

      // Or copying to the float array for defect correction
      switch (type) {
        JUST_COPY(MRC_MODE_BYTE, bFrame[base + ix]);
        JUST_COPY(MRC_MODE_SHORT, sFrame[base + ix]);
        JUST_COPY(MRC_MODE_USHORT, usFrame[base + ix]);
        JUST_COPY(MRC_MODE_FLOAT, fFrame[base + ix]);
      }
    }
  }

  // Defect correction: pass one past edge on right and bottom
  if (mCamSizeX > 0) {
    left = (mCamSizeX / mDefectBin - mNx) / 2;
    top = (mCamSizeY / mDefectBin - mNy) / 2;
    right = left + mNx;
    bottom = top + mNy;
    CorDefCorrectDefects(mCamDefects, fOut, MRC_MODE_FLOAT, defBin, top, left, bottom,
                         right);
  }
}
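
/*
 * A typical driving loop for nextFrame below (illustrative; no gain reference,
 * truncation, or defect correction, so those arguments are NULL/0, and frameBuffers
 * is a hypothetical array of unsigned short frame buffers):
 */
#if 0
for (int iz = 0; iz < numFrames; iz++)
  if (frameAli.nextFrame(frameBuffers[iz], MRC_MODE_USHORT, NULL, 0, 0, NULL, 0.f,
                         NULL, 0, 0, 0, 0.f, 0.f))
    break;   // a nonzero return indicates a processing or allocation error
#endif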
/*
 * Operate on the next frame
 */
int FrameAlign::nextFrame(void *frame, int type, float *gainRef, int nxGain,
                          int nyGain, void *darkRef, float truncLimit,
                          CameraDefects *defects, int camSizeX, int camSizeY,
                          int defBin, float shiftX, float shiftY)
{
  bool saving = (mNumAllVsAll > 0 || mCumAlignAtEnd || mDeferSumming) &&
    mSummingMode >= 0;
  bool savingFull = (saving || (!mNumAllVsAll && mStackUnpadOnGpu)) &&
    mSummingMode <= 0;
  bool doBinPad = mTaperFrac != 0. || mTrimFrac != 0.;
  bool needPreprocess = gainRef || truncLimit > 0 || camSizeX > 0;
  bool preprocOnGpu = (mGpuFlags & GPU_DO_PREPROCESS) && needPreprocess && !darkRef;
  bool preprocHere = needPreprocess && ((!mNoisePadOnGpu && mSummingMode <= 0) ||
                                        (!mBinPadOnGpu && mSummingMode >= 0) ||
                                        !preprocOnGpu);
  bool stackOnGpu;
  float *binArr = mWorkBinPad;
  float *fullArr = mWorkFullSize;
  float *tempBin, *groupArr;
  int nxBin, nyBin, nxTaper, nyTaper, ind, err, ref, xOffset, yOffset, useInd;
  int ix, filt, useFilt, stackByteSize;
  float xShift, yShift, nearXshift = 0., nearYshift = 0.;
  bool needExtract = true;
  bool didNoisePad = false;
  bool addToFull = mSummingMode < 0 || (mSummingMode == 0 && !mDeferSumming);
  bool filterSubarea = mCumAlignAtEnd || (mGpuFlags & GPU_FOR_ALIGNING) != 0;
  int useType = type;
  int numFrameForAVA = mNumAllVsAll + mGroupSize - 1;
  int numBinPadForAVA = mGroupSize > 1 ? mGroupSize : mNumAllVsAll;
  int sumFrameInd = mDeferSumming ? mNumFrames : B3DMIN(mNumFrames,
                                                        numFrameForAVA - 1);
  int aliFrameInd = mCumAlignAtEnd ? mNumFrames : B3DMIN(mNumFrames,
                                                         numBinPadForAVA - 1);
  int groupInd = B3DMIN(mNumFrames + 1 - mGroupSize, mNumAllVsAll - 1);
  int nxDimForBP = mNx;
  int nxForBP = mNx, nyForBP = mNy;
  float *useFrame = (float *)frame;
  unsigned char *defectMap = NULL;
  CameraDefects gpuDefects;
  int camSizeXforGPU, camSizeYforGPU;

  // Save these as member variables to allow frame recovery from GPU
  mNoiseLength = B3DMAX(mNx, mNy) / 50;
  B3DCLAMP(mNoiseLength, 20, 120);
  mDefectBin = defBin;
  mFrameType = type;
  mNxGain = nxGain;
  mNyGain = nyGain;
  mGainRef = gainRef;
  mCamDefects = defects;
  mCamSizeX = camSizeX;
  mCamSizeY = camSizeY;
  mTruncLimit = truncLimit;

  // PROCESS ALL-VS-ALL RESULT FIRST
  if (mNumAllVsAll && mNumFrames > mGroupSize - 1 && mSummingMode >= 0)
    findAllVsAllAlignment(mNumFrames < numFrameForAVA);
  if (mNumAllVsAll && mNumFrames >= numFrameForAVA && mSummingMode >= 0) {
    for (filt = 0; filt <= mNumFilters; filt++) {
      useFilt = filt == mNumFilters ? mBestFilt : filt;

      // First time, just set the first two shifts and add in 0
      if (mNumFrames == numFrameForAVA) {
        mXshifts[filt].push_back(mXfitShifts[useFilt][0]);
        mYshifts[filt].push_back(mYfitShifts[useFilt][0]);
        mXshifts[filt].push_back(mXfitShifts[useFilt][1]);
        mYshifts[filt].push_back(mYfitShifts[useFilt][1]);
        if (addToFull && filt == mNumFilters && addToSums(NULL, 0, -9, 0)) {
          cleanup();
          return 2;
        }
      } else {

        // Later, get mean difference between last and this set of shifts and adjust
        adjustAndPushShifts(1, filt, useFilt);
      }
      if (addToFull && filt == mNumFilters &&
          addToSums(NULL, 0, -9, mNumFrames + 1 - numFrameForAVA)) {
        cleanup();
        return 2;
      }

      // Save the shifts
      for (ind = 0; ind < mNumAllVsAll; ind++) {
        mLastXfit[filt][ind] = mXfitShifts[useFilt][ind];
        mLastYfit[filt][ind] = mYfitShifts[useFilt][ind];
      }

      // Shift the all-vs-all matrix down
      if (filt < mNumFilters) {
        for (ref = 1; ref < mNumAllVsAll - 1; ref++) {
          for (ind = ref + 1; ind < mNumAllVsAll; ind++) {
            mXallShifts[filt][(ref - 1) * mNumAllVsAll + ind - 1] =
              mXallShifts[filt][ref * mNumAllVsAll + ind];
            mYallShifts[filt][(ref - 1) * mNumAllVsAll + ind - 1] =
              mYallShifts[filt][ref * mNumAllVsAll + ind];
          }
        }
      }
    }

    // No need to roll saved sum array conditionally here, rolling happens in addToSum
  }

  // Roll the saved align array if no cumulative alignment, roll groups unconditionally
  if (mNumAllVsAll && mNumFrames >= numBinPadForAVA && mSummingMode >= 0) {
    if (mGpuAligning) {
      if (!mCumAlignAtEnd)
        sFgpuRollAlignStack();
      if (mGroupSize > 1 && mNumFrames >= mNumAllVsAll + mGroupSize - 1)
        sFgpuRollGroupStack();
    } else {
      if (!mCumAlignAtEnd)
        utilRollSavedFrames(mSavedBinPad, numBinPadForAVA);
      if (mGroupSize > 1 && mNumFrames >= mNumAllVsAll + mGroupSize - 1) {
        utilRollSavedFrames(mSavedGroups, mNumAllVsAll);
      }
    }
  }
  stackOnGpu = mStackUnpadOnGpu &&
    (!mGpuStackLimit || mNumStackedOnGpu < mGpuStackLimit);

  // For first frame, set up pre-processing params on GPU; do this here so it is easy
  // to fall back to CPU entirely if failure
  if (!mNumFrames && mGpuFlags > 0) {
    err = 0;
    if (camSizeX > 0 && preprocOnGpu) {
      camSizeXforGPU = camSizeX;
      camSizeYforGPU = camSizeY;
      gpuDefects = *mCamDefects;
      defectMap = B3DMALLOC(unsigned char, mNx * mNy);
      if (!defectMap) {
        err = 1;
        preprocOnGpu = false;
      }

      // Scale the defects down if they are scaled for K2 and defect binning value is 2
      if (!err && gpuDefects.K2Type > 0 && gpuDefects.wasScaled > 0 && defBin > 1) {
        CorDefScaleDefectsForK2(&gpuDefects, true);
        camSizeXforGPU /= 2;
        camSizeYforGPU /= 2;
      }
      CorDefFillDefectArray(&gpuDefects, camSizeXforGPU, camSizeYforGPU, defectMap,
                            mNx, mNy, true);
    }

    // It tests for both defectMap and camSizeX, so no need to make it 0 if not preproc
    if (sFgpuSetPreProcParams(preprocOnGpu ? gainRef : NULL, nxGain, nyGain,
                              preprocOnGpu ? truncLimit : 0.f, defectMap, camSizeX,
                              camSizeY))
      err = 1;
    free(defectMap);

    // Fall back to doing all preproc and prep steps on CPU if there was an error
    if (err) {
      stackOnGpu = preprocOnGpu = false;
      preprocHere = needPreprocess;
      cancelInitialStepsOnGPU();
    }
  }

  // If noise padding is done here, floats will be saved, but if it is done on GPU
  // and either no preprocessing happens or it happens there, then save raw images
  mStackType = MRC_MODE_FLOAT;
  if (mNoisePadOnGpu && (preprocOnGpu || !needPreprocess)) {
    mStackType = type;
  }
  dataSizeForMode(mStackType, &stackByteSize, &ix);

  // Substitute the save arrays for the working ones; create new and push if needed
  if (saving) {
    if (mNumAllVsAll && !mGpuAligning) {
      if (aliFrameInd < (int)mSavedBinPad.size()) {
        binArr = mSavedBinPad[aliFrameInd];
      } else {
        binArr = B3DMALLOC(float, mAlignPix);
        if (testAndCleanup(!binArr))
          return 1;
        mSavedBinPad.push_back(binArr);
      }
    }
    if (savingFull && !stackOnGpu) {
      if (mNumFullSaved < (int)mSavedFullSize.size()) {
        fullArr = mSavedFullSize[mNumFullSaved];
        mSavedFullFrameNum[mNumFullSaved] = mNumFrames;
      } else {
        fullArr = (float *)malloc(stackByteSize * (mFullXpad + 2) * mFullYpad);
        if (testAndCleanup(!fullArr))
          return 1;
        mSavedFullSize.push_back(fullArr);
        mSavedFullFrameNum.push_back(mNumFrames);
      }
      mNumFullSaved++;
    }
  }

  /* Data flow for cases:
   * fullArr is mWorkFullSize unless saving full images here, then it is in
   * mSavedFullSize.
   * Have float in useFrame = fullArr after pre-processing, otherwise useType in frame.
   * Doing preproc on GPU makes sense only if both noisepad and binpad are to be done
   * there, since otherwise I have to do CPU preproc anyway.
   *
   * ALIGNING ONLY with regular doBinPad
   *  - Not saving on stack, no NTP to think about
   *  - For binpad on GPU:
   *    1) No preproc: send frame/useFrame to align
   *    2) Preproc here: send useFrame to align
   *    3) Preproc on GPU: send frame to align
   *  - No binpad on GPU:
   *    - Send binArr as usual, prepared from fullArr
   *    - Preproc: produces floats in useFrame = mWorkFullSize
   *    - If preproc or input is floats: set fullArr to useFrame
   *    - No preproc and no floats: need taperoutpad to float in fullArr
   * SUMMING ONLY
   *  - Should do NTP on GPU
   *  - If no NTP here, pass useFrame of form useType for summing
   *  - Set fullArr to useFrame to send to summing
   * ALIGNING AND SUMMING for sure:
   * No binpad or noisepad on GPU:
   *  - No regular doBinPad: does NTP to float in fullArr
   *  - Regular doBinPad: NTP to float in fullArr
   * NTP on GPU only, no stack on GPU:
   *  - Want to save unpadded image, but there has to be floats to get the binpad
   *  - Have to do preproc here for binpad
   *    1) No preproc:
   *       - Store input array
   *       - If already floats, memcpy useFrame (frame) to fullArr
   *       - If NOT floats, memcpy useFrame to fullArr then set fullArr back to
   *         mWorkFullSize
   *       - It will then taperoutpad to float in mWorkFullSize
   *    2) Preproc, not on GPU:
   *       - Need to store preproc array as floats and pass to GPU later.  It goes
   *         to fullArr
   *    3) Preproc here and on GPU:
   *       - Should only do this if input not floats
   *       - Store input array: stackType = type
   *       - memcpy frame to fullArr then set fullArr back to mWorkFullSize
   *       - No taperoutpad; useType is floats
   *       - addToSum must pass unpadded stackType array to get NTP'd
   *  - Recovery requires NTP of frames stacked here, with possible preproc for
   *    case 3)
   * Binpad on GPU only (never stack unpad on GPU):
   *  - It needs the unpadded image to go to the GPU
   *  - But it wants to do the NTP here, which will make floats
   *  - Have to do preproc here for NTP
   *  - Could send raw image if there is space for preproc arrays in GPU
   *  - All of these stack NTP'd array as usual
   *    1) No preproc:
   *       - Send raw image in frame, in type form
   *       - This is the same as useFrame, in useType form
   *    2) Preproc only here:
   *       - Send unpadded image in useFrame, in useType form
   *       - But need to preproc into mWorkFullSize not fullArr
   *    3) Preproc in both places, only if input not float (no sense for float):
   *       - Send raw image in frame, in type form
   * Both on GPU, no stack on GPU:
   *  - It needs the unpadded image to go to the GPU for both operations
   *  - It can do preproc twice there to save doing it here (if space and if not float)
   *    1) No preproc:
   *       - The original unpadded image needs to be stacked here (frame in type form)
   *       - This is the same as useFrame, in useType form
   *       - memcpy frame/useFrame to fullArr; what about set fullArr back to
   *         mWorkFullSize?
   *    2) Preproc only here:
   *       - Send unpadded image in useFrame, in useType form
   *       - Preproc into fullArr as usual, into stack
   *    3) Preproc on GPU:
   *       - The image for aligning must be preproc'd and bin-padded by GPU
   *       - addToSum must pass unpadded original type array to get preproc'd and
   *         NTP'd
   *       - Since no preproc done, this is the same as useFrame, in useType form
   *       - memcpy frame/useFrame to fullArr; what about set fullArr back to
   *         mWorkFullSize?
   *  - Recovery requires NTP and possible preproc of frames stacked here
   * Both on GPU, stack on GPU:
   *  - It needs to send the original unpadded image to be stacked AND binpadded
   *  - Send the same place and way as with no stack, but with flag to stack it
   *    1) No preproc: send frame/useFrame in type/useType form to align and stack
   *    2) Preproc only here: send unpadded image in useFrame, in useType form
   *    3) Preproc on GPU: send frame/useFrame in type/useType form to align and stack
   *  - addToSum must indicate which frame on stack to use
   *  - Recovery requires retrieving frames, putting in stack on front or end, and
   *    preproc and NTP of all frames
   */
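
  /*
   * Worked example of one routing from the cases above ("NTP on GPU only, no stack
   * on GPU", case 2): aligning and summing with preprocessing here and noise pad on
   * the GPU leaves all of the flags below false, so preProcessFrame() writes floats
   * directly into the fullArr stack slot, which is passed to the GPU for
   * noise-taper-padding later.
   */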
  // Now set flags for all these cases as needed
  bool needTaperOut = false, setFullToUse = false, copyToStackHere = false;
  bool setFullToWork = false, copyRawInput = false, sendRawToBinPad = false;
  bool preprocIntoWork = false;
  if (mSummingMode > 0) {

    // Aligning only
    if (doBinPad) {
      if (mBinPadOnGpu) {
        sendRawToBinPad = preprocOnGpu;
      } else {
        setFullToUse = needPreprocess || type == MRC_MODE_FLOAT;
        needTaperOut = !setFullToUse;
      }
    }
  } else if (mSummingMode < 0) {

    // Summing only
    setFullToUse = mNoisePadOnGpu;
  } else {

    // Align and sum
    if (!mBinPadOnGpu && !mNoisePadOnGpu) {
      copyToStackHere = !needPreprocess && type == MRC_MODE_FLOAT;
    } else if (!mBinPadOnGpu && mNoisePadOnGpu) {
      if (!needPreprocess) {
        copyToStackHere = true;
        setFullToWork = true;
        needTaperOut = true;
      } else if (preprocOnGpu) {
        copyToStackHere = true;
        copyRawInput = true;
        setFullToWork = true;
      }
    } else if (mBinPadOnGpu && !mNoisePadOnGpu) {
      sendRawToBinPad = preprocOnGpu || !needPreprocess;
      preprocIntoWork = preprocHere && !preprocOnGpu;
    } else if (mBinPadOnGpu && mNoisePadOnGpu && !stackOnGpu) {
      if (!needPreprocess) {
        copyToStackHere = true;
      } else if (preprocOnGpu) {
        copyToStackHere = true;
      }
    }
  }
  if (doBinPad) {
    nxBin = (mXend + 1 - mXstart) / mBinAlign;
    nyBin = (mYend + 1 - mYstart) / mBinAlign;
    nxTaper = (int)(mTaperFrac * nxBin);
    nyTaper = (int)(mTaperFrac * nyBin);
  }

  // PRE-PROCESS IF ANY
  // Namely if there is gain reference, truncation, or defect correction
  // If using GPU, set parameters to do the pre-proc, bin-pad and regular processing,
  // and possible stacking there.  This is where it finds out the stack type
  if (mGpuFlags) {
    sFgpuSetBinPadParams(mXstart, mXend, mYstart, mYend, mBinAlign, nxTaper, nyTaper,
                         (preprocHere && !sendRawToBinPad) ? MRC_MODE_FLOAT : type,
                         mAntiFiltType, mNoiseLength);
  }

  // Copy into stack now if it is to be raw
  if (copyToStackHere && copyRawInput) {
    dataSizeForMode(type, &useInd, &ix);
    memcpy(fullArr, frame, useInd * mNx * mNy);
    if (setFullToWork)
      fullArr = mWorkFullSize;
  }

  // Otherwise process here if needed
  if (preprocHere) {

    // Substitute source pointer and type
    useFrame = preprocIntoWork ? mWorkFullSize : fullArr;
    useType = MRC_MODE_FLOAT;
    START_TIMER;
    preProcessFrame(frame, darkRef, defBin, useFrame);
    ADD_TIME(mWallPreProc);
    //utilDumpImage(fullArr, mNx, mNx, mNy, 0, "processed original");
  }
  if (mBinPadOnGpu)
    binArr = sendRawToBinPad ? (float *)frame : useFrame;

  // Now copy data into stack if flag is set
  if (copyToStackHere && !copyRawInput) {
    dataSizeForMode(useType, &useInd, &ix);
    memcpy(fullArr, useFrame, useInd * mNx * mNy);
    if (setFullToWork)
      fullArr = mWorkFullSize;
  }

  // PROCESS THE CURRENT IMAGE: Get the padded full image
  if (mBinPadOnGpu) {
    err = sFgpuProcessAlignImage(binArr, saving ? aliFrameInd : -1, groupInd,
                                 savingFull && stackOnGpu);
    if (err) {

      // If doing noise pad on GPU then need to possibly get stack back from GPU, and
      // also preprocess and noise pad it to be like a normal CPU stack here
      if (mNoisePadOnGpu) {
        if (recoverGpuFullStack(savingFull && stackOnGpu, &binArr)) {
          cleanup();
          return 3;
        }
        didNoisePad = true;
        fullArr = binArr;
        binArr = mWorkBinPad;
        stackOnGpu = false;
      }
      cancelInitialStepsOnGPU();
      setFullToUse = false;

      // But if it was a downstream error, now cancel aligning as well
      if (err == 1) {
        if (recoverGpuAlignFFTs(saving, aliFrameInd, mNumAllVsAll ? NULL : mAlignSum,
                                saving ? NULL : mWorkBinPad, NULL, false))
          return 3;
        binArr = mSavedBinPad[aliFrameInd];
      }
    } else if (stackOnGpu)
      mNumStackedOnGpu++;
  }

  if ((mSummingMode <= 0 && !mNoisePadOnGpu) || !doBinPad) {

    // Big artifacts from the taper method.
    //sliceTaperOutPad(useFrame, useType, mNx, mNy, fullArr, mFullXpad + 2, mFullXpad,
    //mFullYpad, 0, 0.);
    // This produced floats into the full array
    START_TIMER;
    if (!didNoisePad)
      sliceNoiseTaperPad(useFrame, useType, mNx, mNy, fullArr, mFullXpad + 2,
                         mFullXpad, mFullYpad, mNoiseLength, 4, mShiftTemp);
    ADD_TIME(mWallNoise);

    // Set larger size for bin/pad operation to come from this array
    nxDimForBP = mFullXpad + 2;
    nxForBP = mFullXpad;
    nyForBP = mFullYpad;
  } else if (needTaperOut) {

    // This simply converts to float array with no taper/pad
    // This can't be sped up even though copyToCenter looks suspicious.  Tried copying
    // forward in a simple function and it was actually twice as fast to copy backwards
    START_TIMER;
    sliceTaperOutPad(useFrame, useType, mNx, mNy, fullArr, mNx, mNx, mNy, 0, 0.);
    ADD_TIME(mWallNoise);
  } else if (setFullToUse) {

    // And if it is float, just assign it to replace the work array
    fullArr = useFrame;
  }
  //utilDumpImage(fullArr, nxDimForBP, nxForBP, nyForBP, 0, "padded tapered original");

  // If doing trim or taper, bin the subarea and taper inside, take the FFT
  if (doBinPad && mSummingMode >= 0 && !mBinPadOnGpu) {
    xOffset = (nxForBP - mNx) / 2;
    yOffset = (nyForBP - mNy) / 2;

    // Use zoomdown routine for binning, it is a lot faster
    // Have to select the filter again because reading EER selects filter too
    START_TIMER;
    if (mBinAlign > 1) {
      for (ind = 0; ind < nyForBP; ind++)
        mLinePtrs[ind] = (unsigned char *)(fullArr + ind * nxDimForBP);
      if (selectZoomFilter(mAntiFiltType, 1. / mBinAlign, &ind) ||
          !zoomWithFilter(mLinePtrs, nxForBP, nyForBP, (float)(mXstart + xOffset),
                          (float)(mYstart + yOffset), nxBin, nyBin, nxBin, 0,
                          MRC_MODE_FLOAT, binArr, NULL, NULL))
        needExtract = false;
    }
    if (needExtract)
      extractWithBinning(fullArr, MRC_MODE_FLOAT, nxDimForBP, mXstart + xOffset,
                         mXend + xOffset, mYstart + yOffset, mYend + yOffset,
                         mBinAlign, binArr, 0, &nxBin, &nyBin);
    //utilDumpImage(binArr, nxBin, nxBin, nyBin, 0, "binned");
    sliceTaperInPad(binArr, MRC_MODE_FLOAT, nxBin, 0, nxBin - 1, 0, nyBin - 1, binArr,
                    mAlignXpad + 2, mAlignXpad, mAlignYpad, nxTaper, nyTaper);
    //utilDumpImage(binArr, mAlignXpad + 2, mAlignXpad, mAlignYpad, 0,
    //"padded tapered");
    ADD_TIME(mWallBinPad);
  }

  if (mGpuAligning && doBinPad && !mBinPadOnGpu) {

    // Take FFT and save in stack on GPU if needed; if there is an error here, try to
    // recover the stack or the sum and just turn off GPU aligning
    if (sFgpuProcessAlignImage(binArr, saving ? aliFrameInd : -1, groupInd,
                               savingFull && stackOnGpu)) {
      if (recoverGpuAlignFFTs(saving, aliFrameInd, mNumAllVsAll ? NULL : mAlignSum,
                              saving ? NULL : mWorkBinPad, &binArr,
                              savingFull && stackOnGpu))
        return 3;
    } else if (stackOnGpu)
      mNumStackedOnGpu++;
  }
  if (!mGpuAligning) {
    START_TIMER;
    todfftc(binArr, mAlignXpad, mAlignYpad, 0);
    ADD_TIME(mWallBinFFT);
  }

  // Take the full FFT
  if (!doBinPad || (mSummingMode <= 0 && !mGpuSumming)) {
    START_TIMER;
    todfftc(fullArr, mFullXpad, mFullYpad, 0);
    ADD_TIME(mWallFullFFT);
  }

  // If just summing, add into sum and return
  if (mSummingMode < 0) {
    mXshifts[0].push_back(shiftX);
    mYshifts[0].push_back(shiftY);
    mXshifts[1].push_back(shiftX);
    mYshifts[1].push_back(shiftY);
    if (addToSums(fullArr, -1, -9, mNumFrames)) {
      cleanup();
      return 2;
    }
    mNumFrames++;
    return 0;
  }

  // Now if not doing taper/pad, reduce the FFT into the align array
  if (!doBinPad) {
    START_TIMER;
    fourierReduceImage(fullArr, mFullXpad, mFullYpad, binArr, mAlignXpad, mAlignYpad,
                       0., 0., mShiftTemp);
    ADD_TIME(mWallReduce);
  }

  // Apply full filter to the align array unless on GPU
  if (!mGpuAligning) {
    START_TIMER;
    for (ind = 0; ind < mAlignPix; ind++)
      binArr[ind] *= mFullFiltMask[ind];
    ADD_TIME(mWallFilter);
  }

  // Make a new group if ready
  useInd = aliFrameInd;
  if (mGroupSize > 1 && mNumFrames >= mGroupSize - 1) {
    if (!mGpuAligning) {
      if (groupInd < (int)mSavedGroups.size()) {
        groupArr = mSavedGroups[groupInd];
      } else {
        groupArr = B3DMALLOC(float, mAlignPix);
        if (testAndCleanup(!groupArr))
          return 1;
        mSavedGroups.push_back(groupArr);
      }
      memset(groupArr, 0, mAlignBytes);
      for (ind = aliFrameInd + 1 - mGroupSize; ind <= aliFrameInd; ind++) {
        tempBin = mSavedBinPad[ind];
        for (ix = 0; ix < mAlignPix; ix++)
          groupArr[ix] += tempBin[ix];
      }
      //utilDumpFFT(groupArr, mAlignXpad, mAlignYpad, "group", 1, groupInd);
    }
    useInd = groupInd;
  }
  if (!mNumAllVsAll && mXshifts[0].size() > 0) {
    nearXshift = mXshifts[0].back();
    nearYshift = mYshifts[0].back();
  }
  if (mNumAllVsAll && mNumFrames + mGroupSize - 1 > 0) {

    // Align this frame with each previous frame, or nonoverlapping group
    ind = B3DMIN(mNumFrames + 1 - mGroupSize, mNumAllVsAll - 1);
    for (ref = 0; ref <= ind - mGroupSize; ref++) {
      for (filt = 0; filt < mNumFilters; filt++) {
        nearXshift = mXnearShifts[ind - 1] - mXnearShifts[ref];
        nearYshift = mYnearShifts[ind - 1] - mYnearShifts[ref];
        if (alignTwoFrames(useInd + ref - ind, useInd, nearXshift, nearYshift, filt,
                           xShift, yShift, filterSubarea || mNumFilters > 1,
                           mDumpCorrs)) {
          cleanup();
          return 2;
        }
        mXallShifts[filt][ref * mNumAllVsAll + ind] = xShift;
        mYallShifts[filt][ref * mNumAllVsAll + ind] = yShift;
        if (mDebug > 1)
          utilPrint("%d to %d %.2f %.2f near %.2f %.2f\n", useInd, useInd + ref - ind,
                    xShift, yShift, nearXshift, nearYshift);
      }
    }
  } else if (!mNumAllVsAll) {

    // Or align with the sum and add to the sums
    xShift = yShift = 0.;
    if (!mCumAlignAtEnd)
      useInd = -1;
    if (mNumFrames && alignTwoFrames(-1, useInd, nearXshift, nearYshift, 0, xShift,
                                     yShift, filterSubarea, mDumpCorrs)) {
      cleanup();
      return 2;
    }
    mXshifts[0].push_back(xShift);
    mYshifts[0].push_back(yShift);
    mXshifts[1].push_back(xShift);
    mYshifts[1].push_back(yShift);
    if (mSummingMode <= 0 && !mDeferSumming && stackOnGpu)
      ix = 0;
    else
      ix = (mSummingMode <= 0 && !mDeferSumming) ? -1 : -9;
    if (addToSums((mSummingMode <= 0 && !mDeferSumming && !stackOnGpu) ?
                  fullArr : NULL, ix, useInd, mNumFrames)) {
      cleanup();
      return 2;
    }
  }

  // Increase the frame count after processing a frame
  mNumFrames++;
  return 0;
}
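
/*
 * Model solved by the all-vs-all fit below: each measured shift between groups ref
 * and ind estimates a difference of per-group shifts,
 *   x[ind] - x[ref] ~ mXallShifts[ref * mNumAllVsAll + ind]
 * (and likewise in Y).  With G groups that gives up to G * (G - 1) / 2 equations in
 * G - 1 unknowns once the constraint that the shifts sum to zero eliminates the last
 * one; doRegression() builds exactly this design matrix.
 */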
/*
 * Solve for the alignment of the current group of frames
 */
void FrameAlign::findAllVsAllAlignment(bool justForLimits)
{
  int row, numData, numCol, numInCol, ref, ind, filt, allInd;
  int maxAsBest, indOfMax = -1, maxDrop, numSets, dropRun, numPred, indMaxDist = 0;
  bool pickable, failed[MAX_FILTERS];
  int numFrames = B3DMIN(mNumFrames, mNumAllVsAll + mGroupSize - 1);
  int numGroups = numFrames + 1 - mGroupSize;
  float resMean[MAX_FILTERS], resSD[MAX_FILTERS], predMean[MAX_FILTERS];
  float maxWgtRes[MAX_FILTERS], maxRaw[MAX_FILTERS];
  int numFailed[MAX_FILTERS];
  bool doRobust = numFrames >= 5 && mKfactor > 0.;
  float finalX = 0., finalY = 0., fracDrop, predSum;
  float errx, erry, resid, wgtResid, resSum, resSumSq, distFilt, dist0, weight;
  float minWgt = 2.f, minError = 1.e30f;
  float maxFitDist = 0., fitDist[MAX_FILTERS], errMeasure[MAX_FILTERS];
  float smoothDist[MAX_FILTERS], maxSmoothDist = 0.;
  float distCrit = 0.75f;
  float notZeroCrit = 4.f;
  float absZeroCrit = (float)(0.15 * mBinAlign);
  float relZeroCrit = (float)(0.5 * mBinAlign);
  float closerRatio = 0.2f;
  FloatVec fullWeights;
  std::set<int> dropSet;
  int indResMin = 0, indPredMin = 0;

  numCol = numGroups + 3;
  numInCol = numGroups - 1;

  // Evaluate failures of higher filters relative to lower ones
  for (filt = 0; filt < MAX_FILTERS; filt++)
    numFailed[filt] = 0;
  if (mNumFilters > 1) {
    for (ind = 1; ind < numGroups; ind++) {
      for (ref = 0; ref <= ind - mGroupSize; ref++) {
        allInd = ref * mNumAllVsAll + ind;
        dist0 = sqrt(mXallShifts[0][allInd] * mXallShifts[0][allInd] +
                     mYallShifts[0][allInd] * mYallShifts[0][allInd]);
        if (dist0 > notZeroCrit) {
          for (filt = 1; filt < mNumFilters; filt++) {
            distFilt = sqrt(mXallShifts[filt][allInd] * mXallShifts[filt][allInd] +
                            mYallShifts[filt][allInd] * mYallShifts[filt][allInd]);
            if (distFilt < absZeroCrit ||
                (distFilt < relZeroCrit && distFilt < closerRatio * dist0))
              numFailed[filt]++;
          }
        }
      }
    }
    if (mDebug > 1)
      utilPrint("numFailed %d %d %d %d %d\n", numFailed[1], numFailed[2],
                numFailed[3], numFailed[4], numFailed[5]);
  }
  maxAsBest = 0;
  if (!mPickedBestFilt)
    mBestFilt = 0;
  numData = (numGroups + 1 - mGroupSize) * (numGroups - mGroupSize) / 2;
  doRobust = numData >= 2 * numGroups && mKfactor > 0.;
  for (filt = 0; filt < mNumFilters; filt++) {
    maxRaw[filt] = maxWgtRes[filt] = 0.;
    resSum = resSumSq = 0;
    if (mGroupSize > 1 && (numGroups < mGroupSize || numData < numGroups)) {
      for (ind = 0; ind < numGroups; ind++)
        mXnearShifts[ind] = mYnearShifts[ind] = 0.;
      for (ind = 1; ind < numGroups; ind++) {
        for (ref = 0; ref <= ind - mGroupSize; ref++) {
          if (mXnearShifts[ind] == 0.) {
            mXnearShifts[ind] = mXnearShifts[ref] +
              mXallShifts[0][ref * mNumAllVsAll + ind];
            mYnearShifts[ind] = mYnearShifts[ref] +
              mYallShifts[0][ref * mNumAllVsAll + ind];
          }
        }
      }
      continue;
    } else if (numGroups == 1) {

      // Deal with having only 1 or 2 frames
      mXfitShifts[filt][0] = 0.;
      mYfitShifts[filt][0] = 0.;
      mXnearShifts[0] = 0.;
      mYnearShifts[0] = 0.;
      continue;
    } else if (numGroups == 2) {
      mXfitShifts[filt][0] = -mXallShifts[filt][1] / 2.f;
      mYfitShifts[filt][0] = -mYallShifts[filt][1] / 2.f;
      mXfitShifts[filt][1] = mXallShifts[filt][1] / 2.f;
      mYfitShifts[filt][1] = mYallShifts[filt][1] / 2.f;
      if (!filt) {
        mXnearShifts[0] = -mXallShifts[0][1] / 2.f;
        mYnearShifts[0] = -mYallShifts[0][1] / 2.f;
        mXnearShifts[1] = mXallShifts[0][1] / 2.f;
        mYnearShifts[1] = mYallShifts[0][1] / 2.f;
      }
      continue;
    }

    // Otherwise, do the fitting
    dropSet.clear();
    doRegression(doRobust, filt, filt, dropSet);

    // For first filter, copy to the shifts used for predictions
    if (!filt) {
      for (ind = 0; ind < numGroups; ind++) {
        mXnearShifts[ind] = mXfitShifts[0][ind];
        mYnearShifts[ind] = mYfitShifts[0][ind];
      }
    }
    if (justForLimits)
      continue;

    // Compute residuals
    row = 0;
    fitDist[filt] = 0.;
    fullWeights.clear();
    for (ind = 1; ind < numGroups; ind++) {
      for (ref = 0; ref <= ind - mGroupSize; ref++) {
        allInd = ref * mNumAllVsAll + ind;
        errx = (mXfitShifts[filt][ind] - mXfitShifts[filt][ref]) -
          mXallShifts[filt][allInd];
        erry = (mYfitShifts[filt][ind] - mYfitShifts[filt][ref]) -
          mYallShifts[filt][allInd];
        resid = sqrt(errx * errx + erry * erry);
        weight = 1.;
        if (doRobust)
          weight = mFitMat[numCol * row + numInCol + 2];
        wgtResid = resid * weight;
        ACCUM_MIN(minWgt, weight);
        fullWeights.push_back(weight);
        resSum += wgtResid;
        resSumSq += wgtResid * wgtResid;
        ACCUM_MAX(maxRaw[filt], resid);
        ACCUM_MAX(maxWgtRes[filt], wgtResid);
        row++;
        if (ind == ref + 1)
          fitDist[filt] += (float)
            sqrt(pow((double)mXfitShifts[filt][ind] - mXfitShifts[filt][ref], 2.) +
                 pow((double)mYfitShifts[filt][ind] - mYfitShifts[filt][ref], 2.));
      }
    }
    sumsToAvgSD(resSum, resSumSq, numData, &resMean[filt], &resSD[filt]);
    predMean[filt] = resMean[filt];

    // Cross-validation for >= 4 groups
    if (numGroups >= 4) {

      // This fancy graded ratio doesn't really matter, it ends up jumping to 2 points
      // at 21 frames
      fracDrop = (float)((0.01 * numData) / numGroups);
      B3DCLAMP(fracDrop, 0.05f, 0.1f);
      maxDrop = B3DMAX(1, (int)(fracDrop * numData));
      numSets = (numData + maxDrop - 1) / maxDrop;
      numPred = 0;
      predSum = 0.;

      // Loop on the runs, set up the drop set for a run, and do regression
      for (dropRun = 0; dropRun < numSets; dropRun++) {
        dropSet.clear();
        for (ind = dropRun; ind < numData; ind += numSets)
          dropSet.insert(ind);
        doRobust = numData - dropSet.size() >= 2 * numGroups && mKfactor > 0.;
        doRegression(doRobust, filt, MAX_FILTERS, dropSet);

        // Compute the leave-out error
        row = 0;
        for (ind = 1; ind < numGroups; ind++) {
          for (ref = 0; ref <= ind - mGroupSize; ref++) {
            if (dropSet.count(row)) {
              allInd = ref * mNumAllVsAll + ind;
              errx = (mXfitShifts[MAX_FILTERS][ind] - mXfitShifts[MAX_FILTERS][ref]) -
                mXallShifts[filt][allInd];
              erry = (mYfitShifts[MAX_FILTERS][ind] - mYfitShifts[MAX_FILTERS][ref]) -
                mYallShifts[filt][allInd];
              resid = sqrt(errx * errx + erry * erry);
              predSum += (doRobust ? fullWeights[row] : 1.f) * resid;
              numPred++;
            }
            row++;
          }
        }
      }
      predMean[filt] = predSum / numPred;
    }

    // Maintain stats for this filter
    if (resMean[filt] < resMean[indResMin])
      indResMin = filt;
    if (predMean[filt] < predMean[indPredMin])
      indPredMin = filt;
    mPredMeanSum[filt] += predMean[filt];
    mResMeanSum[filt] += resMean[filt];
    mResSDsum[filt] += resSD[filt];
    mResMaxSum[filt] += maxWgtRes[filt];
    mRawMaxSum[filt] += maxRaw[filt];
    ACCUM_MAX(mMaxResMax[filt], maxWgtRes[filt]);
    ACCUM_MAX(mMaxRawMax[filt], maxRaw[filt]);
    errMeasure[filt] = (1.f - mMaxMaxWeight) * predMean[filt] +
      mMaxMaxWeight * maxWgtRes[filt];
    failed[filt] = numFailed[filt] >= B3DMAX(1, numFrames - 2);
    if (failed[filt])
      mNumAsBestFilt[filt]--;
    smoothDist[filt] = 0.;
    if (mXshifts[filt].size() >= 3)
      smoothDist[filt] = smoothedTotalDistance(&mXshifts[filt][0], &mYshifts[filt][0],
                                               (int)mXshifts[filt].size(), dist0);
    if (maxFitDist < fitDist[filt]) {
      indMaxDist = filt;
      maxFitDist = fitDist[filt];
    }
    ACCUM_MAX(maxSmoothDist, smoothDist[filt]);
    //PRINT3(filt, fitDist[filt], smoothDist[filt]);
    if (maxAsBest < mNumAsBestFilt[filt]) {
      indOfMax = filt;
      maxAsBest = mNumAsBestFilt[filt];
    }

    // On the last fit, we now know the best filter and can manage the hybrid values
    if (filt == mNumFilters - 1) {
      //PRINT2(maxFitDist, maxSmoothDist);

      // Determine the best filter, discounting ones that have a much lower distance
      // than the maximum distance.  This is driven by the assumption/finding that
      // ones with lower distance were contaminated by bad fits from fixed pattern
      // noise, but it should only be applied when the filter is higher than the one
      // that gives the maximum distance.  In the other direction bigger distances
      // reflect noise from overfitting.
      for (ind = 0; ind < mNumFilters; ind++) {
        if (!mPickedBestFilt && mNumAsBestFilt[ind] > -mFailedOftenCrit &&
            !failed[ind] && errMeasure[ind] < minError &&
            !(ind >= indMaxDist && fitDist[ind] < distCrit * maxFitDist) &&
            !(maxAsBest >= mNumAsBestFilt[ind] + mPickDiffCrit &&
              maxAsBest >= mPickRatioCrit * mNumAsBestFilt[ind])) {
          minError = errMeasure[ind];
          mBestFilt = ind;
        }
        //utilPrint("%d %.3f  ", mNumAsBestFilt[ind], errMeasure[ind]);
      }
      //PRINT2(mBestFilt, indOfMax);
      mResMeanSum[mNumFilters] += resMean[mBestFilt];
      mPredMeanSum[mNumFilters] += predMean[mBestFilt];
      mResSDsum[mNumFilters] += resSD[mBestFilt];
      mResMaxSum[mNumFilters] += maxWgtRes[mBestFilt];
      mRawMaxSum[mNumFilters] += maxRaw[mBestFilt];
      ACCUM_MAX(mMaxResMax[mNumFilters], maxWgtRes[mBestFilt]);
      ACCUM_MAX(mMaxRawMax[mNumFilters], maxRaw[mBestFilt]);
      mNumFits++;
      if (!mPickedBestFilt && mNumFilters > 1 &&
          mNumFrames >= mNumAllVsAll + mGroupSize - 1) {
        mNumAsBestFilt[mBestFilt]++;

        // 3/15/22: Do not pick a filter at this point unless the hybrid solution is
        // actually going to be used
        pickable = mUseHybrid != 0;
        for (ind = 0; ind < mNumFilters; ind++) {
          if (ind != indOfMax && (maxAsBest < mNumAsBestFilt[ind] + mPickDiffCrit ||
                                  maxAsBest < mPickRatioCrit * mNumAsBestFilt[ind]))
            pickable = false;
        }
        if (pickable) {
          if (mDebug)
            utilPrint("After %d frames, picking filter %d as best\n", mNumFrames,
                      indOfMax + 1);
          mBestFilt = indOfMax;
          mPickedBestFilt = true;
        }
      }
      if (numGroups >= 4 && mDebug)
        for (ind = 0; ind < mNumFilters; ind++)
          if ((ind == indPredMin || ind == indResMin) && indResMin != indPredMin)
            utilPrint("filt %d res %.3f%s pred %.3f%s\n", ind, resMean[ind],
                      ind == indResMin ? "*" : " ", predMean[ind],
                      ind == indPredMin ? "*" : " ");
    }
    if (mDebug > 1) {
      utilPrint("%sresidual: mean = %.2f, SD = %.2f, max = %.2f, n = %d\n",
                doRobust ? "weighted " : "", resMean[filt], resSD[filt],
                maxWgtRes[filt], numGroups);
      if (doRobust)
        utilPrint(" unweighted max residual = %.2f, min weight = %.3f\n",
                  maxRaw[filt], minWgt);
    }
  }
}
"weighted " : "", resMean[filt], resSD[filt], maxWgtRes[filt], numGroups); if (doRobust) utilPrint(" unweighted max residual = %.2f, min weight = %.3f\n", maxRaw[filt], minWgt); } } } /* * Fill matrix for regression, optionally dropping the ones in the set, do robust or * regular regression, using values for filt but putting solution in fitInd */ void FrameAlign::doRegression(bool doRobust, int filt, int fitInd, std::set &dropSet) { int row, col, numData, numCol, numInCol, ref, ind, numIter, maxZeroWgt, allInd; float solMat[2 * MAX_ALL_VS_ALL], xMean[MAX_ALL_VS_ALL], xSD[MAX_ALL_VS_ALL]; float maxChange = 0.02f; float maxOscill = 0.05f; int maxIter = 50; int numFrames = B3DMIN(mNumFrames, mNumAllVsAll + mGroupSize - 1); int numGroups = numFrames + 1 - mGroupSize; float finalX = 0., finalY = 0.; // Load the data matrix with the correlations row = 0; numCol = numGroups + 3; numInCol = numGroups - 1; allInd = 0; for (ind = 1; ind < numGroups; ind++) { for (ref = 0; ref <= ind - mGroupSize; ref++) { if (dropSet.count(allInd++)) continue; mFitMat[numCol * row + numInCol] = mXallShifts[filt][ref * mNumAllVsAll + ind]; mFitMat[numCol * row + numInCol + 1] = mYallShifts[filt][ref * mNumAllVsAll + ind]; for (col = 0; col < numInCol; col++) mFitMat[numCol * row + col] = ((ind == numGroups - 1) ? -1.f : 0.f); mFitMat[numCol * row + ref] += -1.f; if (ind < numGroups - 1) mFitMat[numCol * row + ind] += 1.f; /*for (col = 0; col < numInCol; col++) printf("%.1f ", mFitMat[numCol * row + col]); printf("%.6f %.6f\n", mFitMat[numCol * row + numInCol], mFitMat[numCol * row + numInCol + 1]);*/ row++; } } numData = row; // Do robust fitting if enough data, fall back to regular fit on error maxZeroWgt = (int)B3DMIN(0.1 * numData, numFrames - 3); if (doRobust) { row = robustRegress(mFitMat, numCol, 1, numInCol, numData, 2, solMat, numInCol, NULL, xMean, xSD, mFitWork, mKfactor, &numIter, maxIter, maxZeroWgt, maxChange, maxOscill); if (row) { if (mDebug) utilPrint("robustRegress%s failed with error %d\n", filt != fitInd ? 
" for CV" : "" ,row); doRobust = false; } } if (!doRobust) multRegress(mFitMat, numCol, 1, numInCol, numData, 2, 0, solMat, numInCol, NULL, xMean, xSD, mFitWork); // Copy to the appropriate fitShifts for (ind = 0; ind < numInCol; ind++) { finalX -= solMat[ind]; finalY -= solMat[ind + numInCol]; mXfitShifts[fitInd][ind] = solMat[ind]; mYfitShifts[fitInd][ind] = solMat[ind + numInCol]; //PRINT2(solMat[ind], solMat[ind + numInCol]); } mXfitShifts[fitInd][numInCol] = finalX; mYfitShifts[fitInd][numInCol] = finalY; //PRINT2(finalX, finalY); } /* * Finish aligning and averaging the remaining frames and return results */ int FrameAlign::finishAlignAndSum(float refineRadius2, float refineSigma2, float iterCrit, int groupRefine, int doSpline, float *alisum, float *xShifts, float *yShifts, float *rawXshifts, float *rawYshifts, float *ringCorrs, float deltaR, int &bestFilt, float *smoothDist, float *rawDist, float *resMean, float *predMean, float *meanResMax, float *maxResMax, float *meanRawMax, float *maxRawMax, float *evenSum, float *oddSum) { int ind, numPix, frame, maxAsBest, filt, useFilt, iter, ierr, useFrame, numAlign,binInd; float shiftX, shiftY, error, minError, maxRefine; FloatVec refXshift, refYshift, cumXshift, cumYshift, groupXshift, groupYshift; float *realSum = mFullEvenSum; float *evenSource = mFullEvenSum; float *binArr; int nxBin = mNx / mBinSum; int nyBin = mNy / mBinSum; int xStart = (mSumXpad - nxBin) / 2; int xEnd = xStart + nxBin - 1; int yStart = (mSumYpad - nyBin) / 2; int yEnd = yStart + nyBin - 1; bool processFull = mSummingMode <= 0 && !mDeferSumming; double wallRefine = 0; int numAVAforFrames = mNumAllVsAll + mGroupSize - 1; // If nothing is aligned, it is an error if (!mNumFrames) return 1; // Finish up with all-vs-all if (mNumAllVsAll && mSummingMode >= 0) { findAllVsAllAlignment(false); // pick a best filter if haven't got one yet if (mNumFilters > 1 && !mPickedBestFilt) { maxAsBest = 0; for (ind = 0; ind < mNumFilters; ind++) ACCUM_MAX(maxAsBest, mNumAsBestFilt[ind]); mBestFilt = 0; minError = 1.e30f; for (ind = 0; ind < mNumFilters; ind++) { error = (1.f - mMaxMaxWeight) * mPredMeanSum[ind] / B3DMAX(1, mNumFits) + mMaxMaxWeight * mMaxResMax[ind]; if (mNumAsBestFilt[ind] > -mFailedOftenCrit && error < minError && !(maxAsBest >= mNumAsBestFilt[ind] + mPickDiffCrit && maxAsBest >= mPickRatioCrit * mNumAsBestFilt[ind])) { minError = error; mBestFilt = ind; } } } // Then take care of shifts if (mNumFrames <= numAVAforFrames) { // Take shifts as is if never got any before and add these images for (ind = 0; ind < mNumFrames; ind++) { for (filt = 0; filt <= mNumFilters && ind < mNumFrames + 1 - mGroupSize; filt++) { useFilt = filt == mNumFilters ? mBestFilt : filt; mXshifts[filt].push_back(mXfitShifts[useFilt][ind]); mYshifts[filt].push_back(mYfitShifts[useFilt][ind]); } if (processFull && addToSums(NULL, ind, -9, ind)) return 3; } } else { // Or adjust and add in ALL the shifts this time and add the images for (filt = 0; filt <= mNumFilters; filt++) { useFilt = filt == mNumFilters ? mBestFilt : filt; adjustAndPushShifts(mNumAllVsAll - 1, filt, useFilt); } for (ind = 1; ind < numAVAforFrames; ind++) if (processFull && addToSums(NULL, ind - 1, -9, mNumFrames + ind - numAVAforFrames)) return 3; } } bestFilt = mBestFilt; useFilt = mUseHybrid ? 
mNumFilters : mBestFilt; // Convert the shifts from group to frame groupXshift = mXshifts[useFilt]; groupYshift = mYshifts[useFilt]; getAllFrameShifts(refXshift, refYshift, useFilt); mXshifts[useFilt] = refXshift; mYshifts[useFilt] = refYshift; // Now do a refinement with alignment to leave-one-out reference if (mCumAlignAtEnd && mSummingMode >= 0) { START_TIMER; if (mDebug) { smoothDist[useFilt] = smoothedTotalDistance (&mXshifts[useFilt][0], &mYshifts[useFilt][0], (int)mXshifts[useFilt].size(), rawDist[useFilt]); utilPrint("Original distance raw %.2f smoothed %.2f\n", rawDist[useFilt], smoothDist[useFilt]); } // Get a new high-frequency filter mask numPix = mAlignPix; if (!refineRadius2) { refineRadius2 = mRadius2[mBestFilt]; refineSigma2 = mSigma2[mBestFilt]; } XCorrSetCTF(0., refineSigma2 * mBinAlign, 0., refineRadius2 * mBinAlign, mFiltFunc, mAlignXpad, mAlignYpad, &mFiltDelta); for (ind = 0; ind < numPix; ind++) mFullFiltMask[ind] = 1.; XCorrFilterPart(mFullFiltMask, mFullFiltMask, mAlignXpad, mAlignYpad, mFiltFunc, mFiltDelta); if (mGpuAligning && sFgpuNewFilterMask(mFullFiltMask)) { if (recoverGpuAlignFFTs(false, -1, NULL, NULL, NULL, false)) return 3; } // The real accumulated shifts are kept in cumXYshift; the mXYshifts are the ones // that get applied to frames on each iteration if (mGroupSize > 1 && groupRefine) { //cumXshift = groupXshift; //cumYshift = groupYshift; numAlign = mNumFrames + 1 - mGroupSize; } else { groupRefine = 0; numAlign = mNumFrames; } cumXshift = mXshifts[useFilt]; cumYshift = mYshifts[useFilt]; for (iter = 0; iter < mCumAlignAtEnd; iter++) { mGroupSize = 1; if (mGpuAligning) sFgpuSetGroupSize(1); refXshift.clear(); refYshift.clear(); if (iter) { if (mGpuAligning && sFgpuClearAlignSum()) return 3; else if (!mGpuAligning) memset(mAlignSum, 0, mAlignBytes); } // Loop on frames to shift the bin pad image into alignment and add to sum // Skip this on the first iteration for cumulative alignment if (iter || mNumAllVsAll) { for (frame = 0; frame < mNumFrames; frame++) { ierr = addToSums(NULL, -9, frame, frame, useFilt); if (ierr) return ierr; } } // Loop on frames to align maxRefine = 0.; for (frame = 0; frame < numAlign; frame++) { useFrame = frame; if (!mGpuAligning) binArr = mSavedBinPad[frame]; // Subtract this frame from the align sum and filter it if (groupRefine) { // For group refine, add up the frames in group mGroupSize = mGroupSizeInitial; sFgpuSetGroupSize(mGroupSize); useFrame = 0; // Do that on the GPU or in memory if (mGpuAligning) { if (sFgpuSumIntoGroup(frame + mGroupSize - 1, 0)) { if (recoverGpuAlignFFTs(false, -1, mAlignSum, NULL, NULL, false)) return 3; } } if (!mGpuAligning) { binArr = mSavedGroups[0]; memset(binArr, 0, mAlignBytes); for (binInd = frame; binInd < frame + mGroupSize; binInd++) for (ind = 0; ind < mAlignPix; ind++) binArr[ind] += mSavedBinPad[binInd][ind]; } } // If on GPU, that now needs subtracting and filtering if (mGpuAligning) { if (sFgpuSubtractAndFilterAlignSum(useFrame, groupRefine)) { if (recoverGpuAlignFFTs(false, -1, mAlignSum, NULL, NULL, false)) return 3; binArr = groupRefine ? mSavedGroups[0] : mSavedBinPad[frame]; } } // Or, subtract and filter on CPU if (!mGpuAligning) for (ind = 0; ind < numPix; ind++) mWorkBinPad[ind] = (mAlignSum[ind] - binArr[ind]) * mFullFiltMask[ind]; // Align it to LOO sum. 
        // May want to pass a smaller max shift
        ierr = alignTwoFrames(-2, useFrame, 0., 0., 0, shiftX, shiftY, false,
                              mDumpRefCorrs);
        if (ierr)
          return ierr;
        refXshift.push_back(shiftX);
        refYshift.push_back(shiftY);
        error = sqrt(shiftX * shiftX + shiftY * shiftY);
        ACCUM_MAX(maxRefine, error);
      }

      if (groupRefine) {
        for (frame = 0; frame < mNumFrames; frame++) {
          mXshifts[useFilt][frame] = 0.;
          mYshifts[useFilt][frame] = 0.;
        }
        for (frame = 0; frame < numAlign; frame++) {
          for (ind = frame; ind < frame + mGroupSize; ind++) {
            mXshifts[useFilt][ind] += refXshift[frame] / mGroupSize;
            mYshifts[useFilt][ind] += refYshift[frame] / mGroupSize;
          }
        }
        refXshift = mXshifts[useFilt];
        refYshift = mYshifts[useFilt];
      } else {

        // And copy the refine shift over to be applied next time
        mXshifts[useFilt] = refXshift;
        mYshifts[useFilt] = refYshift;
      }

      // Adjust the shifts: add them to cumulative shift
      for (frame = 0; frame < mNumFrames; frame++) {
        cumXshift[frame] += refXshift[frame];
        cumYshift[frame] += refYshift[frame];
        if (mDebug)
          utilPrint("%d %2d %.2f %.2f %.2f %.2f\n", iter, frame, refXshift[frame],
                    refYshift[frame], cumXshift[frame], cumYshift[frame]);
      }
      if (maxRefine < iterCrit)
        break;
    }

    // At end, put the full shifts back
    mXshifts[useFilt] = cumXshift;
    mYshifts[useFilt] = cumYshift;
    ADD_TIME(wallRefine);
  }

  // adjust shifts for initial cumulative alignment to have a mean of 0
  if (!mNumAllVsAll && (mDeferSumming || mSummingMode > 0)) {
    shiftX = shiftY = 0.;
    for (ind = 0; ind < mNumFrames; ind++) {
      shiftX += mXshifts[useFilt][ind] / mNumFrames;
      shiftY += mYshifts[useFilt][ind] / mNumFrames;
    }
    for (ind = 0; ind < mNumFrames; ind++) {
      mXshifts[useFilt][ind] -= shiftX;
      mYshifts[useFilt][ind] -= shiftY;
    }
  }

  // Save to raw shifts and apply spline smoothing now before shifts get used
  for (ind = 0; ind < mNumFrames; ind++) {
    rawXshifts[ind] = mXshifts[useFilt][ind];
    rawYshifts[ind] = mYshifts[useFilt][ind];
  }

  // Spline smoothing: Get the raw distance first, use the spline as smoothed distance
  if (doSpline && mSummingMode >= 0) {
    smoothedTotalDistance(&mXshifts[useFilt][0], &mYshifts[useFilt][0],
                          (int)mXshifts[useFilt].size(), rawDist[useFilt]);
    ind = splineSmooth(rawXshifts, rawYshifts, mNumFrames, &mXshifts[useFilt][0],
                       &mYshifts[useFilt][0], smoothDist[useFilt]);
    if (ind) {
      utilPrint("Spline smoothing of shifts failed with return value %d\n", ind);
      return 1;
    }
  }

  // Sum now if needed, do FRC, then inverse transform and extract appropriate area
  mGroupSize = 1;
  if (mSummingMode <= 0) {
    if (mDeferSumming) {

      // Make the sum
      if (mGpuSumming && (mGpuFlags & GPU_FOR_ALIGNING)) {
        sFgpuCleanAlignItems();
        sFgpuSetUnpaddedSize(mNx, mNy, mFlagsForUnpadCall,
                             (mDebug ? 1 : 0) + (mReportTimes ?
10 : 0)); if (sFgpuSetupSumming(mFullXpad, mFullYpad, mSumXpad, mSumYpad, mEvenOddForSumSetup)) { if (recoverFromSummingFailure(NULL, 0, 0)) return 3; } } for (frame = 0; frame < mNumFrames; frame++) if (addToSums(NULL, frame, -9, frame, useFilt)) return 3; } if (mGpuSumming) { ind = sFgpuReturnSums(mWorkFullSize, mFullEvenSum, mFullOddSum, 0); // If there is no real sum and no even sum, we can't do anything if (ind == 3) return 3; // If there is an error getting real sum but even/odd is there, just cancel flags // to process the FFT(s) in the even/odd sums if (ind & 2) mGpuFlags = 0; // If there is no error, use the real array for extraction if (!ind) realSum = mWorkFullSize; if ((ind & 1) || !(mGpuFlags & GPU_DO_EVEN_ODD)) ringCorrs = NULL; } if (ringCorrs) fourierRingCorr(mFullEvenSum, mFullOddSum, mSumXpad, mSumYpad, ringCorrs, (int)floor(0.5 / deltaR), deltaR, mWorkFullSize); if (mDumpEvenOdd) { utilDumpFFT(mFullEvenSum, mSumXpad, mSumYpad, "even sum", 1); utilDumpFFT(mFullOddSum, mSumXpad, mSumYpad, "odd sum", 1); } if (!mGpuSumming || !mGpuFlags) { // If doing even and odd, copy the even FFT to output buffer before it is added to if (evenSum && oddSum) { evenSource = evenSum; for (ind = 0; ind < (mSumXpad + 2) * mSumYpad; ind++) evenSum[ind] = mFullEvenSum[ind]; } for (ind = 0; ind < (mSumXpad + 2) * mSumYpad; ind++) mFullEvenSum[ind] += mFullOddSum[ind]; START_TIMER; todfftc(mFullEvenSum, mSumXpad, mSumYpad, 1); ADD_TIME(mWallFullFFT); } extractWithBinning(realSum, MRC_MODE_FLOAT, mSumXpad + 2, xStart, xEnd, yStart, yEnd, 1, alisum, 0, &nxBin, &nyBin); // Do the even and odd sums if requested if (evenSum && oddSum) { todfftc(evenSource, mSumXpad, mSumYpad, 1); todfftc(mFullOddSum, mSumXpad, mSumYpad, 1); extractWithBinning(evenSource, MRC_MODE_FLOAT, mSumXpad + 2, xStart, xEnd, yStart, yEnd, 1, evenSum, 0, &nxBin, &nyBin); extractWithBinning(mFullOddSum, MRC_MODE_FLOAT, mSumXpad + 2, xStart, xEnd, yStart, yEnd, 1, oddSum, 0, &nxBin, &nyBin); } } // return best shifts for (ind = 0; ind < mNumFrames; ind++) { //frameShiftFromGroups(ind, useFilt, xShifts[ind], yShifts[ind]); xShifts[ind] = mXshifts[useFilt][ind]; yShifts[ind] = mYshifts[useFilt][ind]; } // Return all the results for (filt = 0; filt <= mNumFilters; filt++) { if (!doSpline || filt != useFilt) smoothDist[filt] = smoothedTotalDistance(&mXshifts[filt][0], &mYshifts[filt][0], (int)mXshifts[filt].size(), rawDist[filt], &refXshift[0], &refYshift[0]); ind = B3DMAX(1, mNumFits); resMean[filt] = mResMeanSum[filt] / ind; predMean[filt] = mPredMeanSum[filt] / ind; meanResMax[filt] = mResMaxSum[filt] / ind; maxResMax[filt] = mMaxResMax[filt]; meanRawMax[filt] = mRawMaxSum[filt] / ind; maxRawMax[filt] = mMaxRawMax[filt]; mPredMeanSum[filt] /= ind; } if (mReportTimes) utilPrint("FullFFT %.3f BinPad %.3f BinFFT %.3f Reduce %.3f Shift %.3f Filt " "%.3f\nConjProd %.3f PreProc %.3f Noise %.3f Sum of those %.3f " " Refine %.3f\n", mWallFullFFT, mWallBinPad, mWallBinFFT, mWallReduce, mWallShift, mWallFilter, mWallConjProd, mWallPreProc, mWallNoise, mWallFullFFT + mWallBinPad + mWallBinFFT + mWallReduce + mWallShift + mWallFilter + mWallConjProd + mWallPreProc + mWallNoise, wallRefine); if (mGpuFlags && mReportTimes) sFgpuPrintTimers(); return 0; } /* * Get the unweighted sum that was made in tandem with the dose-weighted one */ int FrameAlign::getUnweightedSum(float *nonDWsum) { int nxBin = mNx / mBinSum; int nyBin = mNy / mBinSum; int xStart = (mSumXpad - nxBin) / 2; int xEnd = xStart + nxBin - 1; int yStart = (mSumYpad - nyBin) / 2; int 
    yEnd = yStart + nyBin - 1;

  if (!mMakeUnwgtSum || (mMakeUnwgtSum && mUnwgtOnGpu && !mGpuSumming)) {
    utilPrint("There is no unweighted sum available\n");
    return 1;
  }
  if (mUnwgtOnGpu) {
    if (sFgpuReturnUnweightedSum(mUnweightSum))
      return 1;
  } else {
    START_TIMER;
    todfftc(mUnweightSum, mSumXpad, mSumYpad, 1);
    ADD_TIME(mWallFullFFT);
  }
  extractWithBinning(mUnweightSum, MRC_MODE_FLOAT, mSumXpad + 2, xStart, xEnd,
                     yStart, yEnd, 1, nonDWsum, 0, &nxBin, &nyBin);
  return 0;
}

/*
 * Align the frame in binArr to the one in refArr and return the shifts
 * Reference refInd >= 0 for saved frame, -1 for mAlignSum, -2 for mWorkBinPad
 * Image to align: aliInd >= 0 for saved frame, -1 for mWorkBinPad
 */
int FrameAlign::alignTwoFrames(int refInd, int aliInd, float nearXshift,
                               float nearYshift, int filtInd, float &xShift,
                               float &yShift, bool filterSubarea, bool dump)
{
  int limXlo, limXhi, limYlo, limYhi, ind, indPeak;
  float peaks[3], xpeaks[3], ypeaks[3], widths[3], minWidths[3];
  float xTemp[2] = {0., 0.}, yTemp[2] = {0., 0.}, expDist[2];
  float atZeroCrit = (float)(0.1 / mBinAlign);
  float peakRatioCrit = 2.;
  float thirdPeakCrit = 3.;
  float expDistRatioCrit = 2.;
  float minExpDist = 4.;
  float widthRatioCrit = 0.8f;
  float *refArr, *binArr;
  bool useSubarea = filterSubarea || mGpuAligning;
  float *corrTemp = useSubarea ? mCorrFiltTemp : mCorrBinPad;
  int subXoffset = useSubarea ? B3DNINT(-nearXshift / mBinAlign) : 0;
  int subYoffset = useSubarea ? B3DNINT(-nearYshift / mBinAlign) : 0;
  int aliXsize = useSubarea ? mAliFiltSize : mAlignXpad;
  int aliYsize = useSubarea ? mAliFiltSize : mAlignYpad;
  std::vector<float *> *savedVec = mGroupSize > 1 ? &mSavedGroups : &mSavedBinPad;

  // Going to store shifts but they will be negative because we are getting shift
  // to align reference to frame.  So take negative shift here
  limXlo = (int)((-nearXshift - mMaxShift) / mBinAlign) - subXoffset;
  limXhi = (int)ceil((-nearXshift + mMaxShift) / mBinAlign) - subXoffset;
  limYlo = (int)((-nearYshift - mMaxShift) / mBinAlign) - subYoffset;
  limYhi = (int)ceil((-nearYshift + mMaxShift) / mBinAlign) - subYoffset;
  if (!filtInd) {
    if (mGpuAligning) {

      // For GPU alignment, it extracts the wrapped image with origin in center
      // which is ready for filtering the subarea
      if (sFgpuCrossCorrelate(aliInd, refInd, mTempSubFilt, subXoffset,
                              subYoffset)) {
        if (recoverGpuAlignFFTs(false, -1, refInd == -1 ? mAlignSum : NULL,
                                (refInd < -1 || aliInd < 0) ?
mWorkBinPad : NULL, NULL, false)) return 3; } else if (!filterSubarea) { // But if we are not filtering, need to wrap back into corr array wrapImage(mTempSubFilt, aliXsize + 2, aliXsize, aliYsize, corrTemp, aliXsize + 2, aliXsize, aliYsize, 0, 0); } } if (!mGpuAligning) { // Assign arrays from indexes if not aligning on GPU if (refInd >= (int)savedVec->size() || aliInd >= (int)savedVec->size()) return 2; if (refInd >= 0) refArr = (*savedVec)[refInd]; else if (refInd == -1) refArr = mAlignSum; else refArr = mWorkBinPad; if (aliInd >= 0) binArr = (*savedVec)[aliInd]; else binArr = mWorkBinPad; // Copy into the correlation array memcpy(mCorrBinPad, binArr, mAlignBytes); // Get product START_TIMER; conjugateProduct(mCorrBinPad, refArr, mAlignXpad, mAlignYpad); ADD_TIME(mWallConjProd); // Inverse FFT START_TIMER; todfftc(mCorrBinPad, mAlignXpad, mAlignYpad, 1); ADD_TIME(mWallBinFFT); if (dump && filterSubarea) utilDumpImage(mCorrBinPad, mAlignXpad + 2, mAlignXpad, mAlignYpad, 1, "lf correlation", mNumFrames); // If high frequency filter being applied to subarea, extract subarea if (filterSubarea) wrapImage(mCorrBinPad, mAlignXpad + 2, mAlignXpad, mAlignYpad, mTempSubFilt, mAliFiltSize + 2, mAliFiltSize, mAliFiltSize, subXoffset, subYoffset); } if (filterSubarea) sliceTaperInPad(mTempSubFilt, SLICE_MODE_FLOAT, mAliFiltSize + 2, 0, mAliFiltSize - 1, 0, mAliFiltSize - 1, mTempSubFilt, mAliFiltSize + 2, mAliFiltSize, mAliFiltSize, 8, 8); } //dumpImage(mTempSubFilt, aliXsize + 2, aliXsize, aliYsize, 0, "extract"); // Filter subarea to temp array if doing that if (filterSubarea) { START_TIMER; memcpy(mWrapTemp, mTempSubFilt, (aliXsize + 2) * aliYsize * sizeof(float)); todfftc(mWrapTemp, aliXsize, aliYsize, 0); for (ind = 0; ind < (aliXsize + 2) * aliYsize; ind++) mWrapTemp[ind] = mWrapTemp[ind] * mSubFiltMask[filtInd][ind]; todfftc(mWrapTemp, aliXsize, aliYsize, 1); wrapImage(mWrapTemp, aliXsize + 2, aliXsize, aliYsize, corrTemp, aliXsize + 2, aliXsize, aliYsize, 0, 0); ADD_TIME(mWallFilter); } if (dump) utilDumpImage(corrTemp, aliXsize + 2, aliXsize, aliYsize, 1, "correlation", mNumFrames); setPeakFindLimits(limXlo, limXhi, limYlo, limYhi, 1); XCorrPeakFindWidth(corrTemp, aliXsize + 2, aliYsize, xpeaks, ypeaks, peaks, widths, minWidths, 2, 0); indPeak = 0; for (ind = 0; ind < 2; ind++) { if (peaks[ind] > -1.e29) { xTemp[ind] = -(subXoffset + xpeaks[ind]) * mBinAlign; yTemp[ind] = -(subYoffset + ypeaks[ind]) * mBinAlign; expDist[ind] = (float)sqrt(pow((double)xTemp[ind] - nearXshift, 2.) 
                                 + pow((double)yTemp[ind] - nearYshift, 2.));
      if (ind && fabs(xTemp[0]) < atZeroCrit && fabs(yTemp[0]) < atZeroCrit &&
          peaks[1] > peakRatioCrit * peaks[0] &&
          widths[1] < widthRatioCrit * widths[0] &&
          peaks[2] < thirdPeakCrit * peaks[1] &&
          (expDist[1] < expDistRatioCrit * expDist[0] ||
           (expDist[0] < minExpDist && expDist[1] < minExpDist))) {
        indPeak = 1;
        if (mDebug)
          utilPrint("reject peak at %.2f %.2f for %.2f %.2f\n"
                    "peaks: %g %g %g widths %.2f %.2f expDist %.2f %.2f\n",
                    xTemp[0], yTemp[0], xTemp[1], yTemp[1], peaks[0], peaks[1],
                    peaks[2], widths[0], widths[1], expDist[0], expDist[1]);
      }
    }
  }
  xShift = xTemp[indPeak];
  yShift = yTemp[indPeak];
  return 0;
}

/* CALLING USAGE:
   Nextframe:
     calls with first one when first numAVA is done
     calls with second one on stack that first time and later times
       But the stack is rolled by 1 only
     call with fullArr when SummingMode < 0 (just summing)
     For cumulative align, if not deferring, calls with fullArr and binInd
   finishAlignAndSum:
     If no deferred sum and no frames done, calls with each saved frame on stack
     If > numAVA frames done, calls with each frame from 1 to numAVA - 1
     If refining, calls with NULL and frame number to add up bin-pads
     Calls with every frame on stack for any deferred summing
*/

/*
 * Shift and add image to full sum, and bin/pad image to cumulative alignment sum
 * if binInd >= -1
 * sumInd is >= 0 if it comes from stack (so can be a frame number), -1 to use
 * fullArr for a full frame, or < -1 for no full frame sum
 * Here binInd >= 0 for a saved frame, -1 for mWorkBinPad, < -1 for nothing
 * frameNum is the absolute frame number
 */
int FrameAlign::addToSums(float *fullArr, int sumInd, int binInd, int frameNum,
                          int filtInd)
{
  int ind;
  float *binArr;
  float xShift, yShift;
  float *fullSum = (frameNum % 2) ? mFullOddSum : mFullEvenSum;
  if (filtInd < 0)
    filtInd = mNumFilters;
  frameShiftFromGroups(frameNum, filtInd, xShift, yShift);
  if (sumInd == -1 && !fullArr) {
    utilPrint("Program error in addToSums: sumInd = -1 and fullArr is NULL\n");
    return 1;
  }

  // Shift full image and add into final sum if one is passed
  if (sumInd >= -1) {

    // Get a full-sized dose-weight filter regardless of binning because that is
    // needed for the GPU case
    if (mDoingDoseWeighting) {
      doseWeightFilter(mPriorDoseCum, mPriorDoseCum + mFrameDoses[frameNum],
                       mPixelSize, mCritDoseAfac, mCritDoseBfac, mCritDoseCfac,
                       mCritDoseScale, &mDoseWgtFilter[0],
                       (int)mDoseWgtFilter.size(), 0.71f, &mDWFdelta);
      if (mReweightFilt.size()) {
        for (ind = 0; ind < (int)mDoseWgtFilter.size(); ind++)
          mDoseWgtFilter[ind] *= mReweightFilt[ind];
      }
      if (mDebug > 1) {
        utilPrint("Dose weight filter for frame %d (1/pixel, attenuation):\n",
                  frameNum);
        for (ind = 0; ind < (int)mDoseWgtFilter.size();
             ind += (int)mDoseWgtFilter.size() / 35) {
          utilPrint("%.4f %.4f\n", mDWFdelta * ind, mDoseWgtFilter[ind]);
          if (!mDoseWgtFilter[ind])
            break;
        }
      }
      mPriorDoseCum += mFrameDoses[frameNum];
    }
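    /* Editor's note, with hypothetical numbers: the table from doseWeightFilter()
     * is sampled every mDWFdelta cycles/pixel at full size, and
     * filterAndAddToSum() below looks up ctf[(int)(s / delta + 0.5)].  Passing
     * mDWFdelta * mBinSum for reduced sums makes a frequency s in the reduced
     * image, which is s / mBinSum in full-size units, get the full-size
     * attenuation; e.g., with mBinSum = 2 and mDWFdelta = 0.001, s = 0.25 in the
     * reduced sum indexes entry 125, the value for 0.125 cycles/pixel full size.
     */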
    // Replace fullArr if it is indeed the first one on the stack here; only test
    // this if there was local stacking at all, i.e., no GPU summing, or not
    // stacking, or stacking but with a limit
    if (sumInd >= 0 && mSavedFullFrameNum.size() > 0 &&
        (!mGpuSumming || !mStackUnpadOnGpu ||
         (mStackUnpadOnGpu && mGpuStackLimit > 0))) {
      if (mSavedFullFrameNum[0] == frameNum) {
        fullArr = mSavedFullSize[0];
      } else if (!mGpuSumming || !mStackUnpadOnGpu) {
        utilPrint("Next frame to be summed is not the first on the saved memory stack\n");
        return 1;
      }
    }
    //dumpFFT(fullArr, mFullXpad, mFullYpad, "full pad to sum", 1);

    // Try to do sum on GPU if flag set
    if (mGpuSumming) {
      ind = (int)mDoseWgtFilter.size();
      if (sFgpuSetupDoseWeighting(ind > 0 ? &mDoseWgtFilter[0] : NULL, ind,
                                  mDWFdelta) ||
          sFgpuAddToFullSum(fullArr, xShift, yShift)) {
        if (recoverFromSummingFailure(&fullArr, frameNum, sumInd))
          return 3;
      }
    }

    // Do sum into arrays here
    if (mBinSum > 1 && !mGpuSumming) {
      START_TIMER;
      fourierReduceImage(fullArr, mFullXpad, mFullYpad, mReduceTemp, mSumXpad,
                         mSumYpad, xShift, yShift, mShiftTemp);
      ADD_TIME(mWallReduce);
      START_TIMER;

      // Just scale the delta by the binning to use the initial part of the filter
      // on already-reduced images
      if (mDoingDoseWeighting) {
        filterAndAddToSum(mReduceTemp, fullSum, mSumXpad, mSumYpad,
                          &mDoseWgtFilter[0], mDWFdelta * mBinSum);
        if (mMakeUnwgtSum && !mUnwgtOnGpu) {
          for (ind = 0; ind < (mSumXpad + 2) * mSumYpad; ind++)
            mUnweightSum[ind] += mReduceTemp[ind];
        }
        //utilDumpFFT(mReduceTemp, mSumXpad, mSumYpad, "reduced", 0, frameNum);
      } else {
        for (ind = 0; ind < (mSumXpad + 2) * mSumYpad; ind++)
          fullSum[ind] += mReduceTemp[ind];
      }
      ADD_TIME(mWallFilter);
    } else if (!mGpuSumming) {
      START_TIMER;
      fourierShiftImage(fullArr, mFullXpad, mFullYpad, xShift, yShift, mShiftTemp);
      ADD_TIME(mWallShift);
      START_TIMER;
      if (mDoingDoseWeighting) {
        filterAndAddToSum(fullArr, fullSum, mFullXpad, mFullYpad,
                          &mDoseWgtFilter[0], mDWFdelta);
        if (mMakeUnwgtSum && !mUnwgtOnGpu) {
          for (ind = 0; ind < (mFullXpad + 2) * mFullYpad; ind++)
            mUnweightSum[ind] += fullArr[ind];
        }
        //utilDumpFFT(fullArr, mFullXpad, mFullYpad, "shifted", 0, frameNum);
      } else {
        for (ind = 0; ind < (mFullXpad + 2) * mFullYpad; ind++)
          fullSum[ind] += fullArr[ind];
      }
      ADD_TIME(mWallFilter);
      //utilDumpFFT(fullSum, mFullXpad, mFullYpad, "cur sum", 1, mNumFrames);
    }
  }

  // Roll the frame buffer and reduce the number saved
  if (sumInd >= -1 && (mNumAllVsAll || mDeferSumming)) {
    if (fullArr && mNumFullSaved > 0) {
      /*for (int jnd = 0; jnd < mNumFullSaved; jnd++)
        printf("%d ", mSavedFullFrameNum[jnd]);
      printf("\n");*/
      utilRollSavedFrames(mSavedFullSize, mNumFullSaved);
      mNumFullSaved--;
      for (ind = 0; ind < mNumFullSaved; ind++)
        mSavedFullFrameNum[ind] = mSavedFullFrameNum[ind + 1];
      //PRINT3(mSavedFullFrameNum[0], mNumFullSaved, mSavedFullFrameNum[mNumFullSaved-1]);
      /*for (int jnd = 0; jnd < mNumFullSaved; jnd++)
        printf("%d ", mSavedFullFrameNum[jnd]);
      printf("\n");*/
    } else if (!fullArr && mNumStackedOnGpu > 0) {
      mNumStackedOnGpu--;
    }
  }

  // If there is a legal binInd, shift it and add to align sum
  if (binInd < -1)
    return 0;
  if (mGpuAligning) {

    // Shift and add: but don't bother shifting the source when doing simple cum
    // corr.  An error is fatal when shifting in place since we have no idea if it
    // happened, and we also don't know the state of the align sum
    if (sFgpuShiftAddToAlignSum(binInd, xShift / mBinAlign, yShift / mBinAlign,
                                binInd < 0 ? 0 : 1))
      return 3;
  } else {
    START_TIMER;
    if (binInd >= (int)mSavedBinPad.size())
      return 2;
    binArr = binInd < 0 ?
mWorkBinPad : mSavedBinPad[binInd]; fourierShiftImage(binArr, mAlignXpad, mAlignYpad, xShift / mBinAlign, yShift / mBinAlign, mShiftTemp); for (ind = 0; ind < mAlignPix; ind++) mAlignSum[ind] += binArr[ind]; ADD_TIME(mWallShift); } return 0; } /* * Adjust shifts by the cumulative difference from the first set of shifts used, and * store them */ void FrameAlign::adjustAndPushShifts(int topInd, int filt, int useFilt) { int ind; float xDiff, yDiff, xSD, ySD, sem; // Get mean difference between last and this set of shifts for (ind = 0; ind < mNumAllVsAll - 1; ind++) { mLastXfit[filt][ind + 1] = mXfitShifts[useFilt][ind] - mLastXfit[filt][ind + 1]; mLastYfit[filt][ind + 1] = mYfitShifts[useFilt][ind] - mLastYfit[filt][ind + 1]; //xDiff += (mXfitShifts[ind] - mLastXfit[ind + 1]) / (mNumAllVsAll - 1); //yDiff += (mYfitShifts[ind] - mLastYfit[ind + 1]) / (mNumAllVsAll - 1); } avgSD(&mLastXfit[filt][1], mNumAllVsAll - 1, &xDiff, &xSD, &sem); avgSD(&mLastYfit[filt][1], mNumAllVsAll - 1, &yDiff, &ySD, &sem); /* utilPrint("filt %d useFilt %d diff mean %.2f %.2f SD %.2f %.2f\n", filt, useFilt, xDiff, yDiff, xSD, ySD); */ //Add to cumulative difference and push new shifts adjusted by this diff mCumulXdiff[filt] += xDiff; mCumulYdiff[filt] += yDiff; for (ind = 1; ind <= topInd; ind++) { mXshifts[filt].push_back(mXfitShifts[useFilt][ind] - mCumulXdiff[filt]); mYshifts[filt].push_back(mYfitShifts[useFilt][ind] - mCumulYdiff[filt]); } } /* * Determine how many frames need to be copied from GPU after a failure and allocate * arrays here; the vector operations at least do not work when vector is shared with * a DLL for SEMCCD */ int FrameAlign::prepareToFetchAlignFFTs(int aliFrameInd) { int numBinPads, numGroups, ind; float *tempBin; sFgpuNumberOfAlignFFTs(&numBinPads, &numGroups); if (aliFrameInd >= numBinPads) numBinPads = aliFrameInd + 1; for (ind = (int)mSavedBinPad.size(); ind < numBinPads; ind++) { tempBin = B3DMALLOC(float, mAlignPix); if (testAndCleanup(!tempBin)) return 1; mSavedBinPad.push_back(tempBin); } for (ind = (int)mSavedGroups.size(); ind < numGroups; ind++) { tempBin = B3DMALLOC(float, mAlignPix); if (testAndCleanup(!tempBin)) return 1; mSavedGroups.push_back(tempBin); } return 0; } /* * Higher level routine to recover align FFTs will also recover the full stack if there * is noise padding, and take FFTs of individual frames when grouping with no refine at * end */ int FrameAlign::recoverGpuAlignFFTs(bool saving, int aliFrameInd, float *alignSum, float *workArr, float **binArr, bool stacking) { int ind, ix; float *tempBin; // Fix up the full stack if doing noise pad on GPU and cancel all initial steps there if (mNoisePadOnGpu) { if (recoverGpuFullStack(stacking, NULL)) { cleanup(); return 3; } } cancelInitialStepsOnGPU(); // Get the arrays made up if (prepareToFetchAlignFFTs(saving ? aliFrameInd : -1)) return 3; // Get the data back if (sFgpuReturnAlignFFTs(mSavedBinPad.size() ? &mSavedBinPad[0] : NULL, mSavedGroups.size() ? &mSavedGroups[0] : NULL, alignSum, workArr)) { cleanup(); return 3; } mGpuAligning = false; // Grouping with no refine at end was saved as real space, so need to take FFTs if (mGroupSize > 1 && !mCumAlignAtEnd) { for (ind = 0; ind < (int)mSavedBinPad.size() - (saving ? 
1 : 0); ind++) { todfftc(mSavedBinPad[ind], mAlignXpad, mAlignYpad, 0); for (ix = 0; ix < mAlignPix; ix++) mSavedBinPad[ind][ix] *= mFullFiltMask[ix]; } } // Take care of current align array if provided: put it on the stack if (saving && binArr) { tempBin = mSavedBinPad[aliFrameInd]; memcpy(tempBin, *binArr, mAlignBytes); *binArr = tempBin; } utilPrint("Switching to aligning with the CPU\n"); return 0; } /* * Get back the stack of full-sized images from the GPU and get larger arrays for ones on * the CPU stack if necessary; preprocess if needed, and noise pad them. */ int FrameAlign::recoverGpuFullStack(bool stacking, float **binArr) { int numSavedStart = mNumFullSaved; int ind, jnd, frameNum, sourceType; float *destArr, *sourceArr; bool needPreprocess = (mGainRef || mTruncLimit > 0. || mCamSizeX > 0) && (mGpuFlags & GPU_DO_PREPROCESS); float *allocedArr = NULL; // Loop on frames in CPU stack then ones in GPU stack, then maybe the one being stacked for (ind = 0; ind < numSavedStart + mNumStackedOnGpu + (stacking ? 1 : 0); ind++) { // Allocate an array if stack is not floats or stack not big enough if (mStackType != MRC_MODE_FLOAT || ind >= (int)mSavedFullSize.size()) { allocedArr = B3DMALLOC(float, (mFullXpad + 2) * mFullYpad); if (!allocedArr) return 1; destArr = allocedArr; } else { // Otherwise point final array to stack buffer destArr = mSavedFullSize[ind]; } // Set source as stack if on CPU and save frame number if (ind < numSavedStart) { sourceArr = mSavedFullSize[ind]; frameNum = mSavedFullFrameNum[ind]; } else if (ind < numSavedStart + mNumStackedOnGpu) { // Or fetch into the allocated array sourceArr = allocedArr; if (sFgpuReturnStackedFrame(sourceArr, &frameNum)) { free(allocedArr); return 1; } mNumFullSaved++; } else { // Or use the passed array for current frame sourceArr = *binArr; frameNum = mNumFrames; *binArr = destArr; mNumFullSaved++; } sourceType = mStackType; // Preprocess into work array if needed if (needPreprocess) { preProcessFrame(sourceArr, NULL, mDefectBin, mWorkFullSize); sourceType = MRC_MODE_FLOAT; sourceArr = mWorkFullSize; } // Noise taper pad into the destination array sliceNoiseTaperPad(sourceArr, sourceType, mNx, mNy, destArr, mFullXpad + 2, mFullXpad, mFullYpad, mNoiseLength, 4, mShiftTemp); // Replace array on stack if alloced and < size of stack if (ind < (int)mSavedFullSize.size()) { if (allocedArr) { free(mSavedFullSize[ind]); mSavedFullSize[ind] = allocedArr; mSavedFullFrameNum[ind] = frameNum; } } else { // Otherwise push onto stack mSavedFullSize.push_back(allocedArr); mSavedFullFrameNum.push_back(frameNum); } allocedArr = NULL; } // Sort the stack by frame number for (ind = 0; ind < mNumFullSaved - 1; ind++) { for (jnd = ind + 1; jnd < mNumFullSaved; jnd++) { if (mSavedFullFrameNum[jnd] < mSavedFullFrameNum[ind]) { destArr = mSavedFullSize[ind]; mSavedFullSize[ind] = mSavedFullSize[jnd]; mSavedFullSize[jnd] = destArr; frameNum = mSavedFullFrameNum[ind]; mSavedFullFrameNum[ind] = mSavedFullFrameNum[jnd]; mSavedFullFrameNum[jnd] = frameNum; } } } mNumStackedOnGpu = 0; return 0; } /* * Turn off all the flags that make it think preprocessing, bin/pad or noise/pad are on * GPU */ void FrameAlign::cancelInitialStepsOnGPU() { if (mNoisePadOnGpu || mBinPadOnGpu) utilPrint("Switching to preprocessing and other initial steps on CPU\n"); mNoisePadOnGpu = mBinPadOnGpu = false; mStackUnpadOnGpu = false; mGpuFlags &= ~(GPU_DO_NOISE_TAPER | GPU_DO_BIN_PAD | STACK_FULL_ON_GPU | GPU_DO_PREPROCESS); mFlagsForUnpadCall &= ~(GPU_DO_NOISE_TAPER | GPU_DO_BIN_PAD | 
                        STACK_FULL_ON_GPU | GPU_DO_PREPROCESS);
  sFgpuSetUnpaddedSize(mNx, mNy, 0, (mDebug ? 1 : 0) + (mReportTimes ? 10 : 0));
}

/*
 * Higher-level return for recovering when deferred summing setup OR adding to sums
 * fails; pass the current fullArr address which may be modified, or NULL during
 * setup, pass the absolute frame number frameNum during summing or 0 in setup,
 * pass the sumInd when summing.  Does cleanup on failures
 */
int FrameAlign::recoverFromSummingFailure(float **fullArr, int frameNum, int sumInd)
{
  int ind;

  // Recover by getting existing sum back and taking FFT of current array
  utilPrint("Switching to summing on CPU\n");
  if (mNoisePadOnGpu) {
    if (recoverGpuFullStack(false, NULL)) {
      cleanup();
      return 3;
    }
    if (mSavedFullFrameNum[0] == frameNum) {
      if (fullArr)
        *fullArr = mSavedFullSize[0];
    } else {
      utilPrint("After recovering from GPU failure, frame to be summed is not "
                "first in stack\n");
      cleanup();
      return 3;
    }
  }
  cancelInitialStepsOnGPU();
  if (frameNum > 0) {
    if (sFgpuReturnSums(mFullEvenSum, mFullEvenSum, mFullOddSum, 1)) {
      cleanup();   // It didn't clean up before
      return 3;
    }
  } else {
    memset(mFullEvenSum, 0, (mSumXpad + 2) * mSumYpad * sizeof(float));
    memset(mFullOddSum, 0, (mSumXpad + 2) * mSumYpad * sizeof(float));
  }
  //utilDumpFFT(mFullEvenSum, mFullXpad, mFullYpad, "recovered even", 1, mNumFrames);
  //utilDumpFFT(mFullOddSum, mFullXpad, mFullYpad, "recovered odd", 1, mNumFrames);
  mGpuSumming = false;
  if (sumInd < 0 && fullArr)
    todfftc(*fullArr, mFullXpad, mFullYpad, 0);
  for (ind = 0; ind < mNumFullSaved; ind++)
    todfftc(mSavedFullSize[ind], mFullXpad, mFullYpad, 0);
  return 0;
}

/*
 * Return smallest multiple of the two numbers that includes all their divisors
 */
int FrameAlign::leastCommonMultiple(int num1, int num2)
{
  int fac;
  for (fac = 64; fac > 1; fac--)
    if (num2 % fac == 0 && num1 % fac == 0)
      num2 /= fac;
  return num1 * num2;
}

/*
 * Wrap an image to go between a correlation with the origin in the corner and one
 * with the origin in the middle
 */
void FrameAlign::wrapImage(float *bufFrom, int nxDimFrom, int nxFrom, int nyFrom,
                           float *bufTo, int nxDimTo, int nxTo, int nyTo,
                           int xOffset, int yOffset)
{
  int ixFrom0[4], ixTo0[4], iyFrom0[4], iyTo0[4], ixFrom1[4], ixTo1[4], iyFrom1[4];
  int iyTo1[4], xnum, ynum, iy, quad;
  utilCoordsForWrap(nxFrom, nyFrom, nxTo, nyTo, xOffset, yOffset, ixFrom0, ixTo0,
                    iyFrom0, iyTo0, ixFrom1, ixTo1, iyFrom1, iyTo1);
  for (quad = 0; quad < 4; quad++) {
    ynum = iyFrom1[quad] + 1 - iyFrom0[quad];
    xnum = ixFrom1[quad] + 1 - ixFrom0[quad];
    if (xnum > 0 && ynum > 0) {
      for (iy = 0; iy < ynum; iy++)
        memcpy(&bufTo[(iy + iyTo0[quad]) * nxDimTo + ixTo0[quad]],
               &bufFrom[(iy + iyFrom0[quad]) * nxDimFrom + ixFrom0[quad]],
               xnum * 4);
    }
  }
}
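/*
 * Editor's usage sketch for wrapImage, with hypothetical sizes: alignTwoFrames()
 * uses it to move a correlation between corner-origin and center-origin layouts.
 * For a 256 x 256 correlation held in an FFT-padded array 258 floats wide:
 *
 *   // Center the origin, offset by the expected peak position
 *   wrapImage(corr, 258, 256, 256, centered, 258, 256, 256, subXoff, subYoff);
 *   // Wrap a 128 x 128 filtered subarea back to a corner origin
 *   wrapImage(filtered, 130, 128, 128, corrSub, 130, 128, 128, 0, 0);
 *
 * Each quadrant moves with one memcpy per line; the 4 in the memcpy above is
 * sizeof(float).
 */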
/*
 * Smooth the trajectory of shifts and compute the total length of it
 */
float FrameAlign::smoothedTotalDistance(float *xShifts, float *yShifts,
                                        int numShifts, float &rawTotal,
                                        float *xSmoothed, float *ySmoothed,
                                        double *variance)
{
  int numFit = B3DMIN(7, numShifts);
  int order = numShifts > 4 ? 2 : 1;
  int numBefore = numFit / 2;
  float delx, dely, intcpt, slopes[2], ro, sa, sb, se, xpred, prederr, ypred;
  float lastXpred, lastYpred, dist, varSum = 0;
  float frame[7] = {0, 1, 2, 3, 4, 5, 6};
  float frameSq[7] = {0, 1, 4, 9, 16, 25, 36};
  int ind, fitStart, fitEnd;
  rawTotal = 0;
  if (numShifts < 2)
    return 0.;
  if (numShifts == 2) {
    delx = xShifts[1] - xShifts[0];
    dely = yShifts[1] - yShifts[0];
    dist = sqrt(delx * delx + dely * dely);
    rawTotal = dist;
    return dist;
  }
  dist = 0.;
  for (ind = 0; ind < numShifts; ind++) {
    if (ind) {
      delx = xShifts[ind] - xShifts[ind - 1];
      dely = yShifts[ind] - yShifts[ind - 1];
      rawTotal += sqrt(delx * delx + dely * dely);
    }
    fitStart = B3DMAX(0, ind - numBefore);
    fitEnd = B3DMIN(fitStart + numFit - 1, numShifts - 1);
    fitStart = fitEnd + 1 - numFit;

    // Get predicted value from fit
    if (order == 1) {
      lsFitPred(frame, &xShifts[fitStart], numFit, &slopes[0], &intcpt, &ro, &sa,
                &sb, &se, frame[ind - fitStart], &xpred, &prederr);
      lsFitPred(frame, &yShifts[fitStart], numFit, &slopes[0], &intcpt, &ro, &sa,
                &sb, &se, frame[ind - fitStart], &ypred, &prederr);
    } else {
      lsFit2Pred(frame, frameSq, &xShifts[fitStart], numFit, &slopes[0],
                 &slopes[1], &intcpt, frame[ind - fitStart],
                 frameSq[ind - fitStart], &xpred, &prederr);
      lsFit2Pred(frame, frameSq, &yShifts[fitStart], numFit, &slopes[0],
                 &slopes[1], &intcpt, frame[ind - fitStart],
                 frameSq[ind - fitStart], &ypred, &prederr);
    }
    if (ind) {
      delx = xpred - lastXpred;
      dely = ypred - lastYpred;
      dist += sqrt(delx * delx + dely * dely);
    }
    lastXpred = xpred;
    lastYpred = ypred;
    if (xSmoothed)
      xSmoothed[ind] = xpred;
    if (ySmoothed)
      ySmoothed[ind] = ypred;
    delx = xpred - xShifts[ind];
    dely = ypred - yShifts[ind];
    varSum += delx * delx + dely * dely;
  }
  if (variance)
    *variance = varSum / (2. * numShifts);
  return dist;
}
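/*
 * Editor's sketch of calling the routine above, with hypothetical values: each
 * point is replaced by its prediction from a least-squares fit to a window of up
 * to 7 neighboring shifts (quadratic for more than 4 points, else linear), and
 * the return value is the length of the smoothed path:
 *
 *   float raw, xs[10], ys[10];   // filled with 10 frame shifts elsewhere
 *   float dist = smoothedTotalDistance(xs, ys, 10, raw);
 *   // dist is typically below raw, since frame-to-frame jitter is fitted away
 *
 * The optional arguments return the smoothed points themselves and the variance
 * of the residuals about the fits.
 */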
/*
 * Get a shift for one frame from shifts that may be for groups
 */
void FrameAlign::frameShiftFromGroups(int frame, int filt, float &shiftX,
                                      float &shiftY)
{
  // Spacing between frames is 1 and first frame of group is at
  // -(group size - 1) / 2 frames relative to group center
  float realInd = (float)(frame - (mGroupSize - 1.) / 2.);
  float frac;
  int ind;
  if (mGroupSize == 1 || (int)mXshifts[filt].size() < 2) {
    shiftX = mXshifts[filt][frame];
    shiftY = mYshifts[filt][frame];
  } else if (realInd < -1) {
    shiftX = mXshifts[filt][0];
    shiftY = mYshifts[filt][0];
  } else if (realInd > (int)mXshifts[filt].size()) {
    shiftX = mXshifts[filt].back();
    shiftY = mYshifts[filt].back();
  } else {

    // Allow a bit of extrapolation.  The (int)mXshifts is crucial here for
    // the comparisons
    ind = (int)floor(realInd);
    B3DCLAMP(ind, 0, (int)mXshifts[filt].size() - 2);
    frac = realInd - ind;
    shiftX = (1.f - frac) * mXshifts[filt][ind] + frac * mXshifts[filt][ind + 1];
    shiftY = (1.f - frac) * mYshifts[filt][ind] + frac * mYshifts[filt][ind + 1];
  }
}

void FrameAlign::getAllFrameShifts(FloatVec &frameXshift, FloatVec &frameYshift,
                                   int useFilt)
{
  frameXshift.resize(mNumFrames);
  frameYshift.resize(mNumFrames);
  for (int ind = 0; ind < mNumFrames; ind++)
    frameShiftFromGroups(ind, useFilt, frameXshift[ind], frameYshift[ind]);
}

/*
 * Do a spline smoothing of the shifts and return the smoothed coordinates and a
 * distance
 */
int FrameAlign::splineSmooth(float *xShifts, float *yShifts, int numShifts,
                             float *smoothedX, float *smoothedY, float &splineDist)
{
  int ind, ierr, mOrder = 2, mode = 2;
  double xVar, yVar, variance = .5;
  float delx, dely;
  double *allWork = B3DMALLOC(double, 6 * (numShifts * mOrder + 1) + numShifts +
                              5 * numShifts + 10);
  double *dXshifts, *dYshifts, *orderedX, *weights, *coeff, *work;
  if (!allWork)
    return -1;
  dXshifts = allWork;
  dYshifts = dXshifts + numShifts;
  orderedX = dYshifts + numShifts;
  weights = orderedX + numShifts;
  coeff = weights + numShifts;
  work = coeff + numShifts;
  splineDist = 0.;
  for (ind = 0; ind < numShifts; ind++) {
    dXshifts[ind] = xShifts[ind];
    dYshifts[ind] = yShifts[ind];
    orderedX[ind] = ind;
    weights[ind] = 1.;
  }

  // Fit to X then get spline values
  gcvspl(orderedX, dXshifts, numShifts, weights, weights, mOrder, numShifts, 1,
         mode, variance, coeff, numShifts, work, &ierr);
  if (ierr) {
    free(allWork);
    return ierr;
  }
  xVar = work[4];
  for (ind = 0; ind < numShifts; ind++) {
    ierr = ind;
    smoothedX[ind] = (float)splder(0, mOrder, numShifts, (double)ind, orderedX,
                                   coeff, &ierr, work);
  }

  // Fit to Y
  gcvspl(orderedX, dYshifts, numShifts, weights, weights, mOrder, numShifts, 1,
         mode, variance, coeff, numShifts, work, &ierr);
  if (ierr) {
    free(allWork);
    return ierr;
  }
  yVar = work[4];
  for (ind = 0; ind < numShifts; ind++) {
    ierr = ind;
    smoothedY[ind] = (float)splder(0, mOrder, numShifts, (double)ind, orderedX,
                                   coeff, &ierr, work);
    if (ind) {
      delx = smoothedX[ind] - smoothedX[ind - 1];
      dely = smoothedY[ind] - smoothedY[ind - 1];
      splineDist += sqrt(delx * delx + dely * dely);
    }
  }
  if (mDebug)
    utilPrint("GCV variance estimates = %f %f\n", xVar, yVar);
  free(allWork);
  return 0;
}

/*
 * Find some crossings of the FRC and the level at half-Nyquist
 */
void FrameAlign::analyzeFRCcrossings(float *ringCorrs, float frcDeltaR,
                                     float &halfCross, float &quartCross,
                                     float &eighthCross, float &halfNyq)
{
  int cenBin, numBins, ind;
  halfCross = 0.;
  quartCross = 0.;
  eighthCross = 0.;
  for (ind = 1; ind < (int)floor(0.5 / frcDeltaR); ind++) {
    if (!halfCross && ringCorrs[ind - 1] >= 0.5 && ringCorrs[ind] <= 0.5)
      halfCross = (float)(frcDeltaR * (ind - 0.5 + (ringCorrs[ind - 1] - 0.5) /
                                       (ringCorrs[ind - 1] - ringCorrs[ind])));
    if (!quartCross && ringCorrs[ind - 1] >= 0.25 && ringCorrs[ind] <= 0.25)
      quartCross = (float)(frcDeltaR * (ind - 0.5 + (ringCorrs[ind - 1] - 0.25) /
                                        (ringCorrs[ind - 1] - ringCorrs[ind])));
    if (!eighthCross && ringCorrs[ind - 1] >= 0.125 && ringCorrs[ind] <= 0.125) {
      eighthCross = (float)(frcDeltaR * (ind - 0.5 + (ringCorrs[ind - 1] - 0.125) /
                                         (ringCorrs[ind - 1] - ringCorrs[ind])));
    }
  }
  cenBin = B3DNINT(0.25 / frcDeltaR - 0.5);
  numBins = B3DMAX(1, B3DNINT(0.075 / frcDeltaR));
  halfNyq = 0.;
  for (ind = 0; ind < numBins; ind++)
    halfNyq += (float)(ringCorrs[ind + cenBin - numBins / 2] / numBins);
}
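/*
 * Editor's worked example for the interpolation above, with hypothetical numbers:
 * with frcDeltaR = 0.02, ringCorrs[12] = 0.6, and ringCorrs[13] = 0.4, the 0.5
 * crossing is reported at
 *
 *   0.02 * (13 - 0.5 + (0.6 - 0.5) / (0.6 - 0.4)) = 0.26 cycles/pixel
 *
 * i.e., bin centers sit at (ind - 0.5) * frcDeltaR and the fractional term
 * places the crossing between adjacent bins.
 */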
/*
 * Apply filter to image and add it to the sum; this is a simplified form of
 * XCorrFilterPart and saves the time of zeroing regions where the filter is zero,
 * plus saving the second pass through to add to sum
 */
void FrameAlign::filterAndAddToSum(float *fft, float *array, int nx, int ny,
                                   float *ctf, float delta)
{
  float x = 0., delx, dely, y = 0., s = 0., maxFreq;
  double ysq = 0.;
  int ix, iy, index = 0, ind = 0, indp1 = 0, indf = 0, nxDiv2, nxDiv2p1, nyMinus1;
  int nxMax;
  int numThreads, maxThreads = 16;
  nxDiv2 = nx / 2;
  nxDiv2p1 = nxDiv2 + 1;
  nyMinus1 = ny - 1;
  delx = (float)(1.0 / nx);
  dely = (float)(1.0 / ny);

  /* Find last non-zero filter value in range that matters */
  for (ix = (int)(0.707 / delta); ix > 1; ix--)
    if (ctf[ix])
      break;

  /* Get a frequency limit to apply in Y and a limit to X indexes */
  maxFreq = (ix + 1) * delta;
  nxMax = (int)(maxFreq / delx) + 1;
  B3DCLAMP(nxMax, 1, nxDiv2);

  /* This formula gives 1.5+ at 128 and 11.5+ at 4096 */
  numThreads = B3DNINT(3.33 * (log10((double)nx * ny) - 3.75));
  B3DCLAMP(numThreads, 1, maxThreads);
  numThreads = numOMPthreads(numThreads);

  /* apply filter function on fft, put result in array */
#pragma omp parallel for num_threads(numThreads) \
  shared(nyMinus1, dely, maxFreq, nxDiv2, array, nxMax, delta, fft, nxDiv2p1) \
  private(iy, y, ix, ind, index, ysq, x, indp1, s, indf)
  for (iy = 0; iy <= nyMinus1; iy++) {
    y = iy * dely;
    index = iy * nxDiv2p1;
    if (y > 0.5)
      y = 1.0f - y;
    if (y > maxFreq)
      continue;
    ysq = y * y;
    x = 0.0;
    for (ix = 0; ix <= nxMax; ix++) {
      ind = 2 * (index + ix);
      indp1 = ind + 1;
      s = (float)sqrt(x * x + ysq);
      indf = (int)(s / delta + 0.5f);
      array[ind] += fft[ind] * ctf[indf];
      array[indp1] += fft[indp1] * ctf[indf];
      x = x + delx;
    }
  }
}

/////////////////////////////////////
// Routines for evaluating memory and capabilities allowed by it, needed in
// alignframes and SerialEM.  SerialEM does not instantiate the class for K2, so
// these are statics
/////////////////////////////////////

/*
 * Get sizes in bytes for full, sum, and align images
 */
void FrameAlign::getPadSizesBytes(int nx, int ny, float fullTaperFrac, int sumBin,
                                  int alignBin, float &fullPadSize,
                                  float &sumPadSize, float &alignPadSize)
{
  fullPadSize = 4.f * (1.f + 2.f * fullTaperFrac) * (1.f + 2.f * fullTaperFrac) *
    nx * ny;
  sumPadSize = fullPadSize / (sumBin * sumBin);
  alignPadSize = (float)(4.
* nx * ny) / ((float)alignBin * alignBin); } /* * Return the memory needed for aligning and summing on the GPU based on the conditions, * in bytes */ void FrameAlign::gpuMemoryNeeds(float fullPadSize, float sumPadSize, float alignPadSize, int numAllVsAll, int nzAlign, int refineAtEnd, int groupSize, float &needForGpuSum, float &needForGpuAli) { int numHoldAlign; // The FFTs require TWICE as much memory as the image, so allow for the biggest needForGpuSum = fullPadSize + sumPadSize + 2.f * B3DMAX(fullPadSize, sumPadSize); // Get needs for aligning on GPU // Simple cumulative sum needs all 5 arrays and FFT work area // Refining needs all arrays too plus one for each frame // Otherwise only 3 arrays and FFT and number of rolling frames if (!numAllVsAll && !refineAtEnd) numHoldAlign = 6; else if (refineAtEnd) numHoldAlign = 6 + nzAlign; else numHoldAlign = 4 + B3DMIN(numAllVsAll, nzAlign); if (groupSize > 1) { // If doing groups, refinement requires space for the group sums, and // no refinement requires space for groupSize single frames if (refineAtEnd) numHoldAlign += B3DMIN(numAllVsAll, nzAlign); else numHoldAlign += groupSize; } needForGpuAli = numHoldAlign * alignPadSize; } /* * Return the computer memory needed based on the conditions, in gigabytes, and set * flag for whether sums can be made in one pass, and return maximum number of full frames * that need to be held in memory somewhere */ float FrameAlign::totalMemoryNeeds(float fullPadSize, int fullDataSize, float sumPadSize, float alignPadSize, int numAllVsAll, int nzAlign, int refineAtEnd, int numBinTests, int numFiltTests, int hybridShifts, int groupSize, int doSpline, int gpuFlags, int deferSum, int testMode, int startAssess, bool &sumInOnePass, int &numHoldFull) { int numHoldAlign, stackLimit; float memTot; // Make sums as you go if there is one binning and no subset assessment sumInOnePass = numBinTests == 1 && !testMode && startAssess < 0; // Initialize each hold to numAVA then adjust align holding needed numHoldFull = numHoldAlign = B3DMIN(numAllVsAll, nzAlign); if (numAllVsAll) { if (refineAtEnd) numHoldAlign = nzAlign; if (groupSize > 1 && refineAtEnd) numHoldAlign += B3DMIN(numAllVsAll, nzAlign); else if (groupSize > 1) numHoldAlign += groupSize; } else if (refineAtEnd) { numHoldAlign = nzAlign; } if (gpuFlags & GPU_FOR_ALIGNING) numHoldAlign = 0; // Adjust full holding needed and reduce by GPU stack if any if ((!hybridShifts && numFiltTests > 1 && sumInOnePass && numAllVsAll) || refineAtEnd || deferSum || doSpline) numHoldFull = nzAlign; if (gpuFlags & STACK_FULL_ON_GPU) { if (gpuFlags & GPU_STACK_LIMITED) { stackLimit = (gpuFlags >> GPU_STACK_LIM_SHIFT) & GPU_STACK_LIM_MASK; numHoldFull = B3DMAX(0, numHoldFull - stackLimit); } else { numHoldFull = 0; } } if ((numBinTests > 1 || testMode) && numAllVsAll && startAssess < 0) numHoldFull = 0; memTot = (float)((2. * sumPadSize + (numHoldAlign + 4.) * alignPadSize + fullPadSize + numHoldFull * fullDataSize * fullPadSize / 4.) / (1024. * 1024. 
                         * 1024.));
  return memTot;
}

/*
 * Given the parameters for what processing is needed and how many frames need to
 * be held in a stack, try to add noise pad then bin pad on the GPU, trying first
 * with preprocessing on GPU because that is a preferable endpoint and allows a
 * smaller stack.  Memory is in bytes for GPU
 */
float FrameAlign::findPreprocPadGpuFlags(int unpaddedX, int unpaddedY, int dataSize,
                                         int binning, bool hasGain, bool hasDefect,
                                         bool hasTrunc, int numExpected,
                                         float freeMem, float stackMargin,
                                         int inFlags, int &outFlags)
{
  bool needsProc = hasGain || hasDefect || hasTrunc;
  bool doAlign = (inFlags & GPU_FOR_ALIGNING) != 0;
  bool doSum = (inFlags & GPU_FOR_SUMMING) != 0;
  int passSize, maxAdded;
  int stackLimit = 0;
  float tempNeeds, unpadBytes;
  int preprocFlags = GPU_DO_PREPROCESS | (hasGain ? GPU_DO_GAIN_NORM : 0) |
    (hasDefect ? GPU_CORRECT_DEFECTS : 0);
  outFlags = inFlags;

  // Align only is separate
  if (doAlign && !doSum) {
    if (!needsProc ||
        !preprocPadGpuMemoryFits(unpaddedX, unpaddedY, dataSize, binning, hasGain,
                                 hasDefect, hasTrunc, true, false, true, freeMem,
                                 tempNeeds)) {

      // No proc or proc does not fit, evaluate just binPad
      if (preprocPadGpuMemoryFits(unpaddedX, unpaddedY, dataSize, binning, hasGain,
                                  hasDefect, hasTrunc, false, false, true, freeMem,
                                  tempNeeds)) {
        outFlags |= GPU_DO_BIN_PAD;
        return tempNeeds;
      }
      return 0.;
    } else {

      // Proc and binpad fit
      outFlags |= GPU_DO_BIN_PAD | preprocFlags;
      return tempNeeds;
    }
  }

  // Align and sum or sum only: first evaluate noise and preproc
  if (!needsProc ||
      !preprocPadGpuMemoryFits(unpaddedX, unpaddedY, dataSize, binning, hasGain,
                               hasDefect, hasTrunc, true, true, false, freeMem,
                               tempNeeds)) {

    // No proc, or proc and noise do not fit, evaluate just noise
    if (preprocPadGpuMemoryFits(unpaddedX, unpaddedY, dataSize, binning, hasGain,
                                hasDefect, hasTrunc, false, true, false, freeMem,
                                tempNeeds)) {

      // Noise fits without preproc, try binpad too
      if (doAlign &&
          preprocPadGpuMemoryFits(unpaddedX, unpaddedY, dataSize, binning, hasGain,
                                  hasDefect, hasTrunc, false, true, true, freeMem,
                                  tempNeeds)) {

        // Noise and bin pad fit, set flags and fall through to stack testing
        outFlags |= GPU_DO_BIN_PAD | GPU_DO_NOISE_TAPER;
      } else {

        // Only noise fits or is needed, no bin pad or preproc
        outFlags |= GPU_DO_NOISE_TAPER;
        return tempNeeds;
      }
    } else {

      // Even noise does not fit
      return 0.;
    }
  } else {

    // Preproc and noise fit, evaluate binpad
    if (doAlign &&
        preprocPadGpuMemoryFits(unpaddedX, unpaddedY, dataSize, binning, hasGain,
                                hasDefect, hasTrunc, true, true, true, freeMem,
                                tempNeeds)) {

      // Proc and binpad fit with noise, do it all and fall through to stack
      outFlags |= GPU_DO_BIN_PAD | GPU_DO_NOISE_TAPER | preprocFlags;
    } else {

      // Preproc and only noise fit or are needed, set flags
      outFlags |= GPU_DO_NOISE_TAPER | preprocFlags;
      return tempNeeds;
    }
  }

  // Now see if a stack can fit and how much
  // One frame on stack replaces the temp raw array, hence compute added number
  // and add 1
  passSize = (needsProc && !(outFlags & preprocFlags)) ?
sizeof(float) : dataSize; unpadBytes = (float)(passSize * unpaddedX * unpaddedY); maxAdded = (int)((freeMem - tempNeeds - stackMargin) / unpadBytes); if (maxAdded >= numExpected - 1) { tempNeeds += (numExpected - 1) * unpadBytes; outFlags |= STACK_FULL_ON_GPU; } else if (maxAdded > 0) { stackLimit = maxAdded + 1; tempNeeds += (float)maxAdded * unpadBytes; outFlags |= STACK_FULL_ON_GPU | GPU_STACK_LIMITED | (stackLimit << GPU_STACK_LIM_SHIFT); } return tempNeeds; } /* * For one set of possible operations on GPU, determine if the needed memory fits * within freeMem, return needed amount in bytes in needsMem */ bool FrameAlign::preprocPadGpuMemoryFits(int unpaddedX, int unpaddedY, int dataSize, int binning, bool hasGain, bool hasDefect, bool hasTrunc, bool doPreProc, bool doNoise, bool doBinPad, float freeMem, float &needsMem) { bool needsProc = hasGain || hasDefect || hasTrunc; int passSize = (needsProc && !doPreProc) ? sizeof(float) : dataSize; float unpadBytes = (float)(unpaddedX * unpaddedY); needsMem = 0.; // A gain reference needs a float array, defects need a byte array, noise or bin pad // needs the raw temp array if not stacking, binning needs the reduced in X float array if (doPreProc && hasGain) needsMem += 4.f * unpadBytes; if (doPreProc && hasDefect) needsMem += unpadBytes; if (doNoise || doBinPad) needsMem += (float)passSize * unpadBytes; if (doBinPad && binning > 1) needsMem += 4.f * unpadBytes / (float)binning; return needsMem < freeMem; } /* * Set the truncation limit to use: pass it on if >= 0, or find mean/sd and get threshold * if it is negative # of SD's. This is here for convenience of framealign callers * Returns 1 for error making line pointers, or 2 for error getting sample arrays */ int FrameAlign::setTruncationLimit(void *array, int nx, int ny, int useMode, float truncLimit, float &truncUse) { int err, type = typeForSampleMean(useMode); int dataSize, temp; int numSample = 20000; unsigned char **linePtrs; float sample, mean, sd; int trim = B3DMIN(nx, ny) / 20; int nxUse = nx - 2 * trim; int nyUse = ny - 2 * trim; truncUse = truncLimit; if (truncLimit >= 0.) return 0; dataSizeForMode(useMode, &dataSize, &temp); linePtrs = makeLinePointers(array, nx, ny, dataSize); if (!linePtrs) return 1; sample = B3DMIN(numSample, nxUse * nyUse) / (float)(nxUse * nyUse); err = sampleMeanSD(linePtrs, type, nx, ny, sample, trim, trim, nxUse, nyUse, &mean, &sd); free(linePtrs); if (err) return 2; truncUse = mean - truncLimit * sd; return 0; }
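/*
 * Editor's usage sketch for setTruncationLimit, with hypothetical values and
 * sizes: a negative limit requests a threshold that many SDs above the sampled
 * mean, since truncUse = mean - truncLimit * sd:
 *
 *   float truncUse;
 *   // Threshold at mean + 4 SD, estimated from ~20000 sampled pixels
 *   if (setTruncationLimit(frame, 3838, 3710, MRC_MODE_SHORT, -4.f, truncUse))
 *     truncUse = -1.;   // e.g., fall back to no truncation on error
 */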