Merge pull request #447 from lu1and10/cufinufft-modeord

ahbarnett · web-flow · commit 7535a82d522b · 2024-06-04T14:48:31.000-04:00
cufinufft modeord
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,8 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
+* cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode
+  order, 1 FFT-style mode order.
 * New doc page: migration guide from NFFT3 (2d1 case only).
 * New foldrescale, removes [-3pi,3pi) restriction on NU points, and slight
   speedup at large tols. Deprecates both opts.chkbnds and error code
diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h
@@ -5,29 +5,29 @@
 
 namespace cufinufft {
 namespace deconvolve {
-template <typename T>
+template <typename T, int modeord>
 __global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1);
-template <typename T>
+template <typename T, int modeord>
 __global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2);
-template <typename T>
+template <typename T, int modeord>
 __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
                               T *fwkerhalf1, T *fwkerhalf2);
-template <typename T>
+template <typename T, int modeord>
 __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
                            T *fwkerhalf1, T *fwkerhalf2);
 
-template <typename T>
+template <typename T, int modeord>
 __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw,
                               cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
-template <typename T>
+template <typename T, int modeord>
 __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw,
                            cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
 
-template <typename T>
+template <typename T, int modeord>
 int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize);
-template <typename T>
+template <typename T, int modeord>
 int cudeconvolve2d(cufinufft_plan_t<T> *d_mem, int blksize);
-template <typename T>
+template <typename T, int modeord>
 int cudeconvolve3d(cufinufft_plan_t<T> *d_mem, int blksize);
 } // namespace deconvolve
 } // namespace cufinufft
diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h
@@ -26,6 +26,9 @@ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults
     int gpu_device_id;
 
     void *gpu_stream;
+
+    int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
+                 //                  1 FFT-style mode order
 } cufinufft_opts;
 
 #endif
diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py
@@ -63,7 +63,8 @@ def _get_NufftOpts():
         ('gpu_spreadinterponly', c_int),
         ('gpu_maxbatchsize', c_int),
         ('gpu_device_id', c_int),
-        ('gpu_stream', c_void_p)
+        ('gpu_stream', c_void_p),
+        ('modeord', c_int)
     ]
     return fields
 
diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py
@@ -65,7 +65,9 @@ class Plan:
                         memory), ``gpu_sort`` (for ``gpu_method == 1``, 0: no
                         sort, 1: sort), ``gpu_kerevalmeth`` (0: direct
                         exp(sqrt), Horner evaluation), ``gpu_device_id`` (GPU
-                        ID), and ``gpu_stream`` (CUDA stream pointer).
+                        ID), ``gpu_stream`` (CUDA stream pointer) and
+                        ``modeord`` (0: CMCL-compatible mode ordering,
+                        1: FFT-style mode ordering).
     """
 
     def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None,
diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py
@@ -9,27 +9,29 @@
 # NOTE: Tests below fail for tolerance 1e-4 (error executing plan).
 
 DTYPES = [np.float32, np.float64]
-SHAPES = [(16,), (16, 16), (16, 16, 16)]
+SHAPES = [(16,), (16, 16), (16, 16, 16), (19,), (17, 19), (17, 19, 24)]
 MS = [256, 1024, 4096]
 TOLS = [1e-3, 1e-6]
 OUTPUT_ARGS = [False, True]
 CONTIGUOUS = [False, True]
+MODEORDS = [0, 1]
 
 
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("shape", SHAPES)
 @pytest.mark.parametrize("M", MS)
 @pytest.mark.parametrize("tol", TOLS)
 @pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
+@pytest.mark.parametrize("modeord", MODEORDS)
+def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, modeord):
     complex_dtype = utils._complex_dtype(dtype)
 
     k, c = utils.type1_problem(dtype, shape, M)
 
     k_gpu = to_gpu(k)
     c_gpu = to_gpu(c)
 
-    plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
+    plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=modeord)
 
     # Since k_gpu is an array of shape (dim, M), this will expand to
     # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all
@@ -43,6 +45,8 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
         fk_gpu = plan.execute(c_gpu)
 
     fk = to_cpu(fk_gpu)
+    if modeord == 1:
+        fk = np.fft.fftshift(fk)
 
     utils.verify_type1(k, c, fk, tol)
 
@@ -53,12 +57,13 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
 @pytest.mark.parametrize("tol", TOLS)
 @pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
 @pytest.mark.parametrize("contiguous", CONTIGUOUS)
-def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous):
+@pytest.mark.parametrize("modeord", MODEORDS)
+def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous, modeord):
     complex_dtype = utils._complex_dtype(dtype)
 
     k, fk = utils.type2_problem(dtype, shape, M)
 
-    plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
+    plan = Plan(2, shape, eps=tol, dtype=complex_dtype, modeord=modeord)
 
     check_result = True
 
@@ -81,7 +86,12 @@ def _execute(*args, **kwargs):
             return plan.execute(*args, **kwargs)
 
     k_gpu = to_gpu(k)
-    fk_gpu = to_gpu(fk)
+
+    if modeord == 1:
+        _fk = np.fft.ifftshift(fk)
+    else:
+        _fk = fk
+    fk_gpu = to_gpu(_fk)
 
     plan.setpts(*k_gpu)
 
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
@@ -59,8 +59,13 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
             return FINUFFT_ERR_CUDA_FAILURE;
 
         // Step 3: deconvolve and shuffle
-        if ((ier = cudeconvolve1d<T>(d_plan, blksize)))
-            return ier;
+        if (d_plan->opts.modeord == 0) {
+            if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize)))
+                return ier;
+        } else {
+            if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize)))
+                return ier;
+        }
     }
 
     return 0;
@@ -95,8 +100,13 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
         d_plan->fk = d_fkstart;
 
         // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if ((ier = cudeconvolve1d<T>(d_plan, blksize)))
-            return ier;
+        if (d_plan->opts.modeord == 0) {
+            if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize)))
+                return ier;
+        } else {
+            if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize)))
+                return ier;
+        }
 
         // Step 2: FFT
         cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
@@ -59,8 +59,13 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
             return FINUFFT_ERR_CUDA_FAILURE;
 
         // Step 3: deconvolve and shuffle
-        if ((ier = cudeconvolve2d<T>(d_plan, blksize)))
-            return ier;
+        if (d_plan->opts.modeord == 0) {
+            if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize)))
+                return ier;
+        } else {
+            if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize)))
+                return ier;
+        }
     }
 
     return 0;
@@ -95,8 +100,13 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
         d_plan->fk = d_fkstart;
 
         // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if ((ier = cudeconvolve2d<T>(d_plan, blksize)))
-            return ier;
+        if (d_plan->opts.modeord == 0) {
+            if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize)))
+                return ier;
+        } else {
+            if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize)))
+                return ier;
+        }
 
         // Step 2: FFT
         cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
@@ -57,8 +57,13 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
             return FINUFFT_ERR_CUDA_FAILURE;
 
         // Step 3: deconvolve and shuffle
-        if ((ier = cudeconvolve3d<T>(d_plan, blksize)))
-            return ier;
+        if (d_plan->opts.modeord == 0) {
+            if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize)))
+                return ier;
+        } else {
+            if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize)))
+                return ier;
+        }
     }
 
     return 0;
@@ -91,8 +96,13 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
         d_plan->fk = d_fkstart;
 
         // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if ((ier = cudeconvolve3d<T>(d_plan, blksize)))
-            return ier;
+        if (d_plan->opts.modeord == 0) {
+            if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize)))
+                return ier;
+        } else {
+            if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize)))
+                return ier;
+        }
 
         // Step 2: FFT
         RETURN_IF_CUDA_ERROR
diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu
@@ -121,5 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts)
 
     // By default, only use device 0
     opts->gpu_device_id = 0;
+
+    opts->modeord = 0;
 }
 }
diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,8 @@ def _get_NufftOpts():`
`63`	`63`	`('gpu_spreadinterponly', c_int),`
`64`	`64`	`('gpu_maxbatchsize', c_int),`
`65`	`65`	`('gpu_device_id', c_int),`
`66`		`- ('gpu_stream', c_void_p)`
	`66`	`+ ('gpu_stream', c_void_p),`
	`67`	`+ ('modeord', c_int)`
`67`	`68`	`]`
`68`	`69`	`return fields`
`69`	`70`
Original file line number	Diff line number	Diff line change
`@@ -121,5 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts)`
`121`	`121`
`122`	`122`	`// By default, only use device 0`
`123`	`123`	`opts->gpu_device_id = 0;`
	`124`	`+`
	`125`	`+ opts->modeord = 0;`
`124`	`126`	`}`
`125`	`127`	`}`