@@ -11,97 +11,97 @@ namespace cufinufft {
1111namespace deconvolve {
1212/* Kernel for copying fw to fk with amplication by prefac/ker */
1313// Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up
14- // to N/2-1)
14+ // to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1).
1515template <typename T>
16- __global__ void deconvolve_1d (int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) {
16+ __global__ void deconvolve_1d (int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord ) {
1717 for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < ms; i += blockDim .x * gridDim .x ) {
18- int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2 ;
18+ int w1 = ( modeord == 0 ) ? ( ( i - ms / 2 >= 0 ) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0 ) ? nf1 + i - ms : i ) ;
1919
20- T kervalue = fwkerhalf1[abs (i - ms / 2 )];
20+ T kervalue = fwkerhalf1[(modeord== 0 ) ? abs (i - ms / 2 ) : ((i - ms + ms / 2 >= 0 ) ? ms - i : i )];
2121 fk[i].x = fw[w1].x / kervalue;
2222 fk[i].y = fw[w1].y / kervalue;
2323 }
2424}
2525
2626template <typename T>
2727__global__ void deconvolve_2d (int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
28- T *fwkerhalf2) {
28+ T *fwkerhalf2, int modeord ) {
2929 for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < ms * mt; i += blockDim .x * gridDim .x ) {
3030 int k1 = i % ms;
3131 int k2 = i / ms;
3232 int outidx = k1 + k2 * ms;
33- int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2 ;
34- int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2 ;
33+ int w1 = ( modeord == 0 ) ? ( ( k1 - ms / 2 >= 0 ) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0 ) ? nf1 + k1 - ms : k1 ) ;
34+ int w2 = ( modeord == 0 ) ? ( ( k2 - mt / 2 >= 0 ) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0 ) ? nf2 + k2 - mt : k2 ) ;
3535 int inidx = w1 + w2 * nf1;
3636
37- T kervalue = fwkerhalf1[abs (k1 - ms / 2 )] * fwkerhalf2[abs (k2 - mt / 2 )];
37+ T kervalue = fwkerhalf1[(modeord== 0 ) ? abs (k1 - ms / 2 ) : ((k1 - ms + ms / 2 >= 0 ) ? ms - k1 : k1) ] * fwkerhalf2[(modeord== 0 ) ? abs (k2 - mt / 2 ) : ((k2 - mt + mt / 2 >= 0 ) ? mt - k2 : k2 )];
3838 fk[outidx].x = fw[inidx].x / kervalue;
3939 fk[outidx].y = fw[inidx].y / kervalue;
4040 }
4141}
4242
4343template <typename T>
4444__global__ void deconvolve_3d (int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw,
45- cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) {
45+ cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord ) {
4646 for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < ms * mt * mu; i += blockDim .x * gridDim .x ) {
4747 int k1 = i % ms;
4848 int k2 = (i / ms) % mt;
4949 int k3 = (i / ms / mt);
5050 int outidx = k1 + k2 * ms + k3 * ms * mt;
51- int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2 ;
52- int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2 ;
53- int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2 ;
51+ int w1 = ( modeord == 0 ) ? ( ( k1 - ms / 2 >= 0 ) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0 ) ? nf1 + k1 - ms : k1 ) ;
52+ int w2 = ( modeord == 0 ) ? ( ( k2 - mt / 2 >= 0 ) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0 ) ? nf2 + k2 - mt : k2 ) ;
53+ int w3 = ( modeord == 0 ) ? ( ( k3 - mu / 2 >= 0 ) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0 ) ? nf3 + k3 - mu : k3 ) ;
5454 int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2;
5555
56- T kervalue = fwkerhalf1[abs (k1 - ms / 2 )] * fwkerhalf2[abs (k2 - mt / 2 )] * fwkerhalf3[abs (k3 - mu / 2 )];
56+ T kervalue = fwkerhalf1[(modeord== 0 ) ? abs (k1 - ms / 2 ) : ((k1 - ms + ms / 2 >= 0 ) ? ms - k1 : k1) ] * fwkerhalf2[(modeord== 0 ) ? abs (k2 - mt / 2 ) : ((k2 - mt + mt / 2 >= 0 ) ? mt - k2 : k2) ] * fwkerhalf3[(modeord== 0 ) ? abs (k3 - mu / 2 ) : ((k3 - mu + mu / 2 >= 0 ) ? mu - k3 : k3 )];
5757 fk[outidx].x = fw[inidx].x / kervalue;
5858 fk[outidx].y = fw[inidx].y / kervalue;
5959 }
6060}
6161
6262/* Kernel for copying fk to fw with same amplication */
6363template <typename T>
64- __global__ void amplify_1d (int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) {
64+ __global__ void amplify_1d (int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord ) {
6565 for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < ms; i += blockDim .x * gridDim .x ) {
66- int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2 ;
66+ int w1 = ( modeord == 0 ) ? ( ( i - ms / 2 >= 0 ) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0 ) ? nf1 + i - ms : i ) ;
6767
68- T kervalue = fwkerhalf1[abs (i - ms / 2 )];
68+ T kervalue = fwkerhalf1[(modeord== 0 ) ? abs (i - ms / 2 ) : ((i - ms + ms / 2 >= 0 ) ? ms - i : i )];
6969 fw[w1].x = fk[i].x / kervalue;
7070 fw[w1].y = fk[i].y / kervalue;
7171 }
7272}
7373
7474template <typename T>
7575__global__ void amplify_2d (int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
76- T *fwkerhalf2) {
76+ T *fwkerhalf2, int modeord ) {
7777 for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < ms * mt; i += blockDim .x * gridDim .x ) {
7878 int k1 = i % ms;
7979 int k2 = i / ms;
8080 int inidx = k1 + k2 * ms;
81- int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2 ;
82- int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2 ;
81+ int w1 = ( modeord == 0 ) ? ( ( k1 - ms / 2 >= 0 ) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0 ) ? nf1 + k1 - ms : k1 ) ;
82+ int w2 = ( modeord == 0 ) ? ( ( k2 - mt / 2 >= 0 ) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0 ) ? nf2 + k2 - mt : k2 ) ;
8383 int outidx = w1 + w2 * nf1;
8484
85- T kervalue = fwkerhalf1[abs (k1 - ms / 2 )] * fwkerhalf2[abs (k2 - mt / 2 )];
85+ T kervalue = fwkerhalf1[(modeord== 0 ) ? abs (k1 - ms / 2 ) : ((k1 - ms + ms / 2 >= 0 ) ? ms - k1 : k1) ] * fwkerhalf2[(modeord== 0 ) ? abs (k2 - mt / 2 ) : ((k2 - mt + mt / 2 >= 0 ) ? mt - k2 : k2 )];
8686 fw[outidx].x = fk[inidx].x / kervalue;
8787 fw[outidx].y = fk[inidx].y / kervalue;
8888 }
8989}
9090
9191template <typename T>
9292__global__ void amplify_3d (int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk,
93- T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) {
93+ T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord ) {
9494 for (int i = blockDim .x * blockIdx .x + threadIdx .x ; i < ms * mt * mu; i += blockDim .x * gridDim .x ) {
9595 int k1 = i % ms;
9696 int k2 = (i / ms) % mt;
9797 int k3 = (i / ms / mt);
9898 int inidx = k1 + k2 * ms + k3 * ms * mt;
99- int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2 ;
100- int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2 ;
101- int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2 ;
99+ int w1 = ( modeord == 0 ) ? ( ( k1 - ms / 2 >= 0 ) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0 ) ? nf1 + k1 - ms : k1 ) ;
100+ int w2 = ( modeord == 0 ) ? ( ( k2 - mt / 2 >= 0 ) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0 ) ? nf2 + k2 - mt : k2 ) ;
101+ int w3 = ( modeord == 0 ) ? ( ( k3 - mu / 2 >= 0 ) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0 ) ? nf3 + k3 - mu : k3 ) ;
102102 int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2;
103103
104- T kervalue = fwkerhalf1[abs (k1 - ms / 2 )] * fwkerhalf2[abs (k2 - mt / 2 )] * fwkerhalf3[abs (k3 - mu / 2 )];
104+ T kervalue = fwkerhalf1[(modeord== 0 ) ? abs (k1 - ms / 2 ) : ((k1 - ms + ms / 2 >= 0 ) ? ms - k1 : k1) ] * fwkerhalf2[(modeord== 0 ) ? abs (k2 - mt / 2 ) : ((k2 - mt + mt / 2 >= 0 ) ? mt - k2 : k2) ] * fwkerhalf3[(modeord== 0 ) ? abs (k3 - mu / 2 ) : ((k3 - mu + mu / 2 >= 0 ) ? mu - k3 : k3 )];
105105 fw[outidx].x = fk[inidx].x / kervalue;
106106 fw[outidx].y = fk[inidx].y / kervalue;
107107 }
@@ -125,13 +125,13 @@ int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize)
125125 if (d_plan->spopts .spread_direction == 1 ) {
126126 for (int t = 0 ; t < blksize; t++) {
127127 deconvolve_1d<<<(nmodes + 256 - 1 ) / 256 , 256 , 0 , stream>>> (ms, nf1, d_plan->fw + t * nf1,
128- d_plan->fk + t * nmodes, d_plan->fwkerhalf1 );
128+ d_plan->fk + t * nmodes, d_plan->fwkerhalf1 , d_plan-> opts . modeord );
129129 }
130130 } else {
131131 checkCudaErrors (cudaMemsetAsync (d_plan->fw , 0 , maxbatchsize * nf1 * sizeof (cuda_complex<T>), stream));
132132 for (int t = 0 ; t < blksize; t++) {
133133 amplify_1d<<<(nmodes + 256 - 1 ) / 256 , 256 , 0 , stream>>> (ms, nf1, d_plan->fw + t * nf1,
134- d_plan->fk + t * nmodes, d_plan->fwkerhalf1 );
134+ d_plan->fk + t * nmodes, d_plan->fwkerhalf1 , d_plan-> opts . modeord );
135135 }
136136 }
137137 return 0 ;
@@ -158,14 +158,14 @@ int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize)
158158 for (int t = 0 ; t < blksize; t++) {
159159 deconvolve_2d<<<(nmodes + 256 - 1 ) / 256 , 256 , 0 , stream>>> (ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2,
160160 d_plan->fk + t * nmodes, d_plan->fwkerhalf1 ,
161- d_plan->fwkerhalf2 );
161+ d_plan->fwkerhalf2 , d_plan-> opts . modeord );
162162 }
163163 } else {
164164 checkCudaErrors (cudaMemsetAsync (d_plan->fw , 0 , maxbatchsize * nf1 * nf2 * sizeof (cuda_complex<T>), stream));
165165 for (int t = 0 ; t < blksize; t++) {
166166 amplify_2d<<<(nmodes + 256 - 1 ) / 256 , 256 , 0 , stream>>> (ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2,
167167 d_plan->fk + t * nmodes, d_plan->fwkerhalf1 ,
168- d_plan->fwkerhalf2 );
168+ d_plan->fwkerhalf2 , d_plan-> opts . modeord );
169169 }
170170 }
171171 return 0 ;
@@ -193,15 +193,15 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize)
193193 for (int t = 0 ; t < blksize; t++) {
194194 deconvolve_3d<<<(nmodes + 256 - 1 ) / 256 , 256 , 0 , stream>>> (
195195 ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes,
196- d_plan->fwkerhalf1 , d_plan->fwkerhalf2 , d_plan->fwkerhalf3 );
196+ d_plan->fwkerhalf1 , d_plan->fwkerhalf2 , d_plan->fwkerhalf3 , d_plan-> opts . modeord );
197197 }
198198 } else {
199199 checkCudaErrors (
200200 cudaMemsetAsync (d_plan->fw , 0 , maxbatchsize * nf1 * nf2 * nf3 * sizeof (cuda_complex<T>), stream));
201201 for (int t = 0 ; t < blksize; t++) {
202202 amplify_3d<<<(nmodes + 256 - 1 ) / 256 , 256 , 0 , stream>>> (
203203 ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes,
204- d_plan->fwkerhalf1 , d_plan->fwkerhalf2 , d_plan->fwkerhalf3 );
204+ d_plan->fwkerhalf1 , d_plan->fwkerhalf2 , d_plan->fwkerhalf3 , d_plan-> opts . modeord );
205205 }
206206 }
207207 return 0 ;
0 commit comments