Skip to content

Commit aa7b396

Browse files
Liang YuGitHub Enterprise
authored andcommitted
CUDA crossmul upsample fix (#717)
* added non-multilooked case and fixed incorrect row size in device interferogram function call * renamed blockRow variables for clarity * added gpu option in workflow * added missing binding for coherence-less crossmul * fixed missing sum = 0 initializations and /= block size normalizations * added flatten kernel * fixed erroneous multilook set logic * fixed incorrect range offset allocation * split interferogram calculation into with and without upsample * removed non-working range filter; will restore in later commit * fixed incorrect parameter for multilooked amplitudes * added normalization to CUDA multilook power test * add get+set for crossmul oversample * added pybind bindings * added oversample option to InSAR crossmul * compute amplitude from upsampled SLC when coherence calculated * added oversample get/set for CUDA crossmul * fixed incorrect grid size * added general SLC device pointers and fixed oversampled amplitude * fixes and cleanup * rewritten to zero out space between shift * grid size for rangeShiftImpactMult_g corrected * normalization value corrected in upsample * upsampled index calculation corrected in interferogram_g * oversample taken into account in multilook amplitude calculation * use local variable instead of global in kernels * fixed upsample SLC cudaFree * possible thread ID integer overflows addressed with switch to size_t * fixed indexing in interferogram_g * added interferogram_g and multilooks_power_g calls for w/ and w/o oversample * made inputs constant in constant * removed global functions from header * moved shift impact multiplication to own function * corrected CUDA and CPU for range shifting odd signals * chagned casts from float/double to thrust::complex<float/double> * consolidated inplace transforms into single template and minor formatting * corrected use of sizeof for complex types * comment/documentation fixes
1 parent 9ea3583 commit aa7b396

File tree

14 files changed

+331
-355
lines changed

14 files changed

+331
-355
lines changed

cxx/isce3/cuda/signal/gpuCrossMul.cu

Lines changed: 105 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@
1515
output
1616
thrust::complex *ifgram (n_cols*n_rows)
1717
input
18-
thrust::complex *refSlcUp ((oversample*n_fft)*n_rows)
18+
thrust::complex *refSlcUp ((oversample*n_fft)*n_rows) nfft >= where ncols
1919
thrust::complex *secSlcUp
2020
size_t n_rows
2121
size_t n_cols
2222
size_t n_fft
23-
int oversample_i
24-
float oversampe_f
23+
int oversample_int
24+
float oversample_float
2525
*/
2626
template <typename T>
2727
__global__ void interferogram_g(thrust::complex<T> *ifgram,
@@ -30,23 +30,37 @@ __global__ void interferogram_g(thrust::complex<T> *ifgram,
3030
size_t n_rows,
3131
size_t n_cols,
3232
size_t n_fft,
33-
int oversample_i,
34-
T oversample_f)
33+
int oversample_int,
34+
T oversample_float)
3535
{
36+
// get 1-d interferogram index
3637
const auto i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
3738

3839
// make sure index within ifgram size bounds
3940
if (i < n_rows * n_cols) {
41+
// break up 1-d index into 2-d index
4042
auto i_row = i / n_cols;
4143
auto i_col = i % n_cols;
4244

43-
ifgram[i] = thrust::complex<T>(0.0, 0.0);
44-
for (int j = 0; j < oversample_i; ++j) {
45-
auto ref_val = refSlcUp[i_row*oversample_i*n_fft + i_col];
46-
auto sec_val_conj = conj(secSlcUp[i_row*oversample_i*n_fft + i_col]);
47-
ifgram[i] += ref_val * sec_val_conj;
45+
// local accumulation variable
46+
auto accumulation = thrust::complex<T>(0.0, 0.0);
47+
48+
// accumulate crossmultiplied oversampled pixels
49+
// oversample_int > 0 so crossmultiply will always be calculated
50+
for (int j = 0; j < oversample_int; ++j) {
51+
// get 1-d, maybe oversampled, index based on 2-d index
52+
// i_row * n_fft + i_col = 1-d index w/o oversampling
53+
// oversample_int * (..) = first 1-d index w/ oversampling
54+
// (...) + j = j-th oversampled index
55+
auto i_up = oversample_int * (i_row * n_fft + i_col) + j;
56+
57+
// get values from SLC rasters and crossmultiply
58+
auto ref_val = refSlcUp[i_up];
59+
auto sec_val_conj = thrust::conj(secSlcUp[i_up]);
60+
accumulation += ref_val * sec_val_conj;
4861
}
49-
ifgram[i] /= oversample_f;
62+
// normalize by oversample factor
63+
ifgram[i] = accumulation / oversample_float;
5064
}
5165
}
5266

@@ -203,7 +217,6 @@ crossmul(isce3::io::Raster& referenceSLC,
203217

204218
}
205219

206-
207220
void isce3::cuda::signal::gpuCrossmul::
208221
crossmul(isce3::io::Raster& referenceSLC,
209222
isce3::io::Raster& secondarySLC,
@@ -277,14 +290,14 @@ crossmul(isce3::io::Raster& referenceSLC,
277290
auto slc_size = n_slc * sizeof(thrust::complex<float>);
278291

279292
// storage for a block of reference SLC data
280-
std::valarray<std::complex<float>> refSlc(n_slc);
281-
thrust::complex<float> *d_refSlc;
282-
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_refSlc), slc_size));
293+
std::valarray<std::complex<float>> refSlcOrig(n_slc);
294+
thrust::complex<float> *d_refSlcOrig;
295+
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_refSlcOrig), slc_size));
283296

284297
// storage for a block of secondary SLC data
285-
std::valarray<std::complex<float>> secSlc(n_slc);
286-
thrust::complex<float> *d_secSlc;
287-
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_secSlc), slc_size));
298+
std::valarray<std::complex<float>> secSlcOrig(n_slc);
299+
thrust::complex<float> *d_secSlcOrig;
300+
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_secSlcOrig), slc_size));
288301

289302
// set upsampled parameters
290303
auto n_slcUpsampled = _oversample * nfft * rowsPerBlock;
@@ -293,13 +306,16 @@ crossmul(isce3::io::Raster& referenceSLC,
293306
// upsampled block of reference SLC
294307
std::valarray<std::complex<float>> refSlcUpsampled(n_slcUpsampled);
295308
thrust::complex<float> *d_refSlcUpsampled;
296-
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_refSlcUpsampled), slcUpsampled_size));
309+
if (_oversample > 1)
310+
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_refSlcUpsampled), slcUpsampled_size));
297311

298312
// upsampled block of secondary SLC
299313
thrust::complex<float> *d_secSlcUpsampled;
300-
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_secSlcUpsampled), slcUpsampled_size));
314+
if (_oversample > 1)
315+
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_secSlcUpsampled), slcUpsampled_size));
301316

302-
// shift impact
317+
// calculate and copy to device shiftImpact frequency responce (a linear phase)
318+
// to a sub-pixel shift in time domain introduced by upsampling followed by downsampling
303319
std::valarray<std::complex<float>> shiftImpact(n_slcUpsampled);
304320
thrust::complex<float> *d_shiftImpact;
305321
lookdownShiftImpact(_oversample,
@@ -349,8 +365,8 @@ crossmul(isce3::io::Raster& referenceSLC,
349365

350366
// determine block layout
351367
dim3 block(THRD_PER_BLOCK);
352-
dim3 grid_hi((refSlc.size()*_oversample+(THRD_PER_BLOCK-1))/THRD_PER_BLOCK);
353-
dim3 grid_reg((refSlc.size()+(THRD_PER_BLOCK-1))/THRD_PER_BLOCK);
368+
dim3 grid_hi((refSlcOrig.size()*_oversample+(THRD_PER_BLOCK-1))/THRD_PER_BLOCK);
369+
dim3 grid_reg((refSlcOrig.size()+(THRD_PER_BLOCK-1))/THRD_PER_BLOCK);
354370
dim3 grid_lo((blockRowsMultiLooked*ncolsMultiLooked+(THRD_PER_BLOCK-1))/THRD_PER_BLOCK);
355371

356372
// configure azimuth filter
@@ -385,8 +401,8 @@ crossmul(isce3::io::Raster& referenceSLC,
385401
}
386402

387403
// fill the valarray with zero before getting the block of the data
388-
refSlc = 0;
389-
secSlc = 0;
404+
refSlcOrig = 0;
405+
secSlcOrig = 0;
390406
refSlcUpsampled = 0;
391407
ifgram = 0;
392408

@@ -397,36 +413,33 @@ crossmul(isce3::io::Raster& referenceSLC,
397413
std::valarray<std::complex<float>> dataLine(ncols);
398414
for (size_t line = 0; line < rowsThisBlock; ++line){
399415
referenceSLC.getLine(dataLine, rowStart + line);
400-
refSlc[std::slice(line*nfft, ncols, 1)] = dataLine;
416+
refSlcOrig[std::slice(line*nfft, ncols, 1)] = dataLine;
401417
secondarySLC.getLine(dataLine, rowStart + line);
402-
secSlc[std::slice(line*nfft, ncols, 1)] = dataLine;
418+
secSlcOrig[std::slice(line*nfft, ncols, 1)] = dataLine;
403419
}
404-
checkCudaErrors(cudaMemcpy(d_refSlc, &refSlc[0], slc_size, cudaMemcpyHostToDevice));
405-
checkCudaErrors(cudaMemcpy(d_secSlc, &secSlc[0], slc_size, cudaMemcpyHostToDevice));
420+
checkCudaErrors(cudaMemcpy(d_refSlcOrig, &refSlcOrig[0], slc_size, cudaMemcpyHostToDevice));
421+
checkCudaErrors(cudaMemcpy(d_secSlcOrig, &secSlcOrig[0], slc_size, cudaMemcpyHostToDevice));
406422

407423
// apply azimuth filter (do inplace)
408424
if (_doCommonAzimuthBandFilter) {
409-
azimuthFilter.filter(d_refSlc);
410-
azimuthFilter.filter(d_secSlc);
425+
azimuthFilter.filter(d_refSlcOrig);
426+
azimuthFilter.filter(d_secSlcOrig);
411427
}
412428

413429
auto oversample_f = static_cast<float>(_oversample);
414430
if (_oversample > 1) {
415431
// upsample reference and secondary. done on device
416432
upsample(signalNoUpsample,
417433
signalUpsample,
418-
d_refSlc,
434+
d_refSlcOrig,
419435
d_refSlcUpsampled,
420436
d_shiftImpact);
421437
upsample(signalNoUpsample,
422438
signalUpsample,
423-
d_secSlc,
439+
d_secSlcOrig,
424440
d_secSlcUpsampled,
425441
d_shiftImpact);
426442

427-
// run kernels to compute oversampled interforgram
428-
// refSignal overwritten with upsampled interferogram
429-
// reduce from nfft*oversample*rowsPerBlock to ncols*rowsPerBlock
430443
interferogram_g<<<grid_reg, block>>>(
431444
d_ifgram,
432445
d_refSlcUpsampled,
@@ -435,8 +448,8 @@ crossmul(isce3::io::Raster& referenceSLC,
435448
} else {
436449
interferogram_g<<<grid_reg, block>>>(
437450
d_ifgram,
438-
d_refSlc,
439-
d_secSlc,
451+
d_refSlcOrig,
452+
d_secSlcOrig,
440453
rowsThisBlock, ncols, nfft, _oversample, oversample_f);
441454
}
442455

@@ -469,7 +482,7 @@ crossmul(isce3::io::Raster& referenceSLC,
469482
d_ifgram,
470483
ncols, // n columns hi res
471484
ncolsMultiLooked, // n cols lo res
472-
_azimuthLooks, // col resize factor of hi to lo
485+
_azimuthLooks, // row resize factor of hi to lo
473486
_rangeLooks, // col resize factor of hi to lo
474487
n_mlook, // number of lo res elements
475488
float(_azimuthLooks*_rangeLooks));
@@ -484,32 +497,58 @@ crossmul(isce3::io::Raster& referenceSLC,
484497
interferogram.setBlock(ifgram_mlook, 0, rowStart/_azimuthLooks,
485498
ncols/_rangeLooks, rowsThisBlock/_azimuthLooks);
486499

487-
// write reduce+abs and set blocks
488-
multilooks_power_g<<<grid_lo, block>>>(
489-
d_ref_amp_mlook,
490-
d_refSlc,
491-
2,
492-
nfft,
493-
ncolsMultiLooked,
494-
_azimuthLooks, // row resize factor of hi to lo
495-
_rangeLooks, // col resize factor of hi to lo
496-
n_mlook, // number of lo res elements
497-
float(_azimuthLooks*_rangeLooks));
500+
if (_oversample > 1) {
501+
// write reduce+abs and set blocks
502+
multilooks_power_g<<<grid_lo, block>>>(
503+
d_ref_amp_mlook,
504+
d_refSlcUpsampled,
505+
2,
506+
_oversample*nfft, // n columns hi res
507+
ncolsMultiLooked, // n columns lo res
508+
_azimuthLooks, // row resize factor of hi to lo
509+
_oversample*_rangeLooks, // col resize factor of hi to lo
510+
n_mlook, // number of lo res elements
511+
float(_oversample*_azimuthLooks*_rangeLooks));
512+
} else {
513+
multilooks_power_g<<<grid_lo, block>>>(
514+
d_ref_amp_mlook,
515+
d_refSlcOrig,
516+
2,
517+
_oversample*nfft, // n columns hi res
518+
ncolsMultiLooked, // n columns lo res
519+
_azimuthLooks, // row resize factor of hi to lo
520+
_oversample*_rangeLooks, // col resize factor of hi to lo
521+
n_mlook, // number of lo res elements
522+
float(_oversample*_azimuthLooks*_rangeLooks));
523+
}
498524

499525
// Check for any kernel errors
500526
checkCudaErrors(cudaPeekAtLastError());
501527
checkCudaErrors(cudaDeviceSynchronize());
502528

503-
multilooks_power_g<<<grid_lo, block>>>(
504-
d_sec_amp_mlook,
505-
d_secSlc,
506-
2,
507-
nfft,
508-
ncolsMultiLooked,
509-
_azimuthLooks, // row resize factor of hi to lo
510-
_rangeLooks, // col resize factor of hi to lo
511-
n_mlook, // number of lo res elements
512-
float(_azimuthLooks*_rangeLooks));
529+
if (_oversample > 1) {
530+
multilooks_power_g<<<grid_lo, block>>>(
531+
d_sec_amp_mlook,
532+
d_secSlcUpsampled,
533+
2,
534+
_oversample*nfft,
535+
ncolsMultiLooked,
536+
_azimuthLooks, // row resize factor of hi to lo
537+
_oversample*_rangeLooks, // col resize factor of hi to lo
538+
n_mlook, // number of lo res elements
539+
float(_oversample*_azimuthLooks*_rangeLooks));
540+
} else {
541+
multilooks_power_g<<<grid_lo, block>>>(
542+
d_sec_amp_mlook,
543+
d_secSlcOrig,
544+
2,
545+
_oversample*nfft,
546+
ncolsMultiLooked,
547+
_azimuthLooks, // row resize factor of hi to lo
548+
_oversample*_rangeLooks, // col resize factor of hi to lo
549+
n_mlook, // number of lo res elements
550+
float(_oversample*_azimuthLooks*_rangeLooks));
551+
}
513552

514553
// Check for any kernel errors
515554
checkCudaErrors(cudaPeekAtLastError());
@@ -543,10 +582,12 @@ crossmul(isce3::io::Raster& referenceSLC,
543582
}
544583

545584
// liberate all device memory
546-
checkCudaErrors(cudaFree(d_refSlc));
547-
checkCudaErrors(cudaFree(d_secSlc));
548-
checkCudaErrors(cudaFree(d_refSlcUpsampled));
549-
checkCudaErrors(cudaFree(d_secSlcUpsampled));
585+
checkCudaErrors(cudaFree(d_refSlcOrig));
586+
checkCudaErrors(cudaFree(d_secSlcOrig));
587+
if (_oversample > 1) {
588+
checkCudaErrors(cudaFree(d_refSlcUpsampled));
589+
checkCudaErrors(cudaFree(d_secSlcUpsampled));
590+
}
550591
checkCudaErrors(cudaFree(d_shiftImpact));
551592
checkCudaErrors(cudaFree(d_ifgram));
552593
if (_doCommonRangeBandFilter) {

cxx/isce3/cuda/signal/gpuCrossMul.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ class isce3::cuda::signal::gpuCrossmul {
9898
/** Get common range band filtering flag */
9999
inline bool doCommonRangeBandFilter() const {return _doCommonRangeBandFilter;};
100100

101+
/** Set oversample*/
102+
inline void oversample(size_t v) {_oversample = v;};
103+
104+
/** Get oversample*/
105+
inline size_t oversample() const {return _oversample;};
106+
101107
private:
102108
//Doppler LUT for the refernce SLC
103109
isce3::core::LUT1d<double> _refDoppler;

0 commit comments

Comments
 (0)