fix incompatibilities with new cupy version (13.5) (#118)

gschramm · web-flow · commit 3f651e2a2988 · 2025-08-20T20:16:34.000+02:00
* fix test failures

* fix test failures

* fix test failures

* remove the cupy<=13.4 pin from the install instructions

* test with array-api-strict~=2.0
diff --git a/.github/workflows/c_python_build.yml b/.github/workflows/c_python_build.yml
@@ -86,7 +86,7 @@ jobs:
     - name: Install tests dependencies
       run: |
         pip install pytest pytest-cov
-        pip install array-api-strict~=1.0
+        pip install array-api-strict~=2.0
 
     - if: matrix.os == 'ubuntu-latest'
       name: Run Tests
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -40,13 +40,13 @@ To install parallelproj and cupy (optional and only if you have a CUDA GPU) from
 
         .. code-block:: console
         
-           $ mamba install parallelproj cupy<=13.4
+           $ mamba install parallelproj cupy
 
     .. tab-item:: conda
 
         .. code-block:: console
         
-           $ conda install -c conda-forge parallelproj cupy<=13.4
+           $ conda install -c conda-forge parallelproj cupy
 
 .. note::
    On conda-forge, CPU, CUDA 11 and CUDA 12 builds of `libparallelproj` are available.
@@ -65,7 +65,7 @@ To install parallelproj and pytorch (optional) from conda-forge, run
 
         .. code-block:: console
         
-           $ mamba install parallelproj pytorch cupy<=13.4
+           $ mamba install parallelproj pytorch cupy
 
     .. tab-item:: mamba, pytorch without CUDA support
 
@@ -77,7 +77,7 @@ To install parallelproj and pytorch (optional) from conda-forge, run
 
         .. code-block:: console
         
-           $ conda install -c conda-forge parallelproj pytorch cupy<=13.4
+           $ conda install -c conda-forge parallelproj pytorch cupy
 
     .. tab-item:: conda pytorch without CUDA support
 
diff --git a/src/parallelproj/backend.py b/src/parallelproj/backend.py
@@ -327,6 +327,8 @@
 
     # ---------------------------------------------------------------------------------------
     if cupy_enabled:
+        import cupy as cp
+
         if "PARALLELPROJ_CUDA_KERNEL_FILE" in os.environ:
             cuda_kernel_file = Path(os.environ["PARALLELPROJ_CUDA_KERNEL_FILE"])
         else:
@@ -536,7 +538,9 @@ def joseph3d_fwd(
                 np.asarray(img.shape, dtype=np.int32),
             )
 
-    return xp.asarray(img_fwd, device=array_api_compat.device(img))
+    return array_api_compat.to_device(
+        xp.from_dlpack(img_fwd), array_api_compat.device(img)
+    )
 
 
 def joseph3d_back(
@@ -651,7 +655,9 @@ def joseph3d_back(
                 np.asarray(back_img.shape, dtype=np.int32),
             )
 
-    return xp.asarray(back_img, device=array_api_compat.device(img_fwd))
+    return array_api_compat.to_device(
+        xp.from_dlpack(back_img), array_api_compat.device(img_fwd)
+    )
 
 
 def joseph3d_fwd_tof_sino(
@@ -816,7 +822,9 @@ def joseph3d_fwd_tof_sino(
                 lor_dependent_tofcenter_offset,
             )
 
-    return xp.asarray(img_fwd, device=array_api_compat.device(img))
+    return array_api_compat.to_device(
+        xp.from_dlpack(img_fwd), array_api_compat.device(img)
+    )
 
 
 def joseph3d_back_tof_sino(
@@ -995,7 +1003,9 @@ def joseph3d_back_tof_sino(
                 lor_dependent_tofcenter_offset,
             )
 
-    return xp.asarray(back_img, device=array_api_compat.device(img_fwd))
+    return array_api_compat.to_device(
+        xp.from_dlpack(back_img), array_api_compat.device(img_fwd)
+    )
 
 
 def joseph3d_fwd_tof_lm(
@@ -1163,7 +1173,9 @@ def joseph3d_fwd_tof_lm(
                 lor_dependent_tofcenter_offset,
             )
 
-    return xp.asarray(img_fwd, device=array_api_compat.device(img))
+    return array_api_compat.to_device(
+        xp.from_dlpack(img_fwd), array_api_compat.device(img)
+    )
 
 
 def joseph3d_back_tof_lm(
@@ -1343,7 +1355,9 @@ def joseph3d_back_tof_lm(
                 lor_dependent_tofcenter_offset,
             )
 
-    return xp.asarray(back_img, device=array_api_compat.device(img_fwd))
+    return array_api_compat.to_device(
+        xp.from_dlpack(back_img), array_api_compat.device(img_fwd)
+    )
 
 
 if cupy_enabled:
@@ -1439,7 +1453,7 @@ def count_event_multiplicity(events: Array) -> Array:
     else:
         tmp = np.unique(events, axis=0, return_counts=True, return_inverse=True)
 
-    mu = xp.asarray(tmp[2][tmp[1]], device=dev)
+    mu = array_api_compat.to_device(xp.from_dlpack(tmp[2][tmp[1]]), dev)
     mu = xp.reshape(mu, (array_api_compat.size(mu),))
 
     return mu
diff --git a/src/parallelproj/operators.py b/src/parallelproj/operators.py
@@ -479,9 +479,11 @@ def _apply(self, x: Array) -> Array:
             else:
                 sigma = self._sigma
 
-            return xp.asarray(
-                ndimagex.gaussian_filter(cp.asarray(x), sigma=sigma, **self._kwargs),
-                device=device(x),
+            return array_api_compat.to_device(
+                xp.from_dlpack(
+                    ndimagex.gaussian_filter(cp.asarray(x), sigma=sigma, **self._kwargs)
+                ),
+                device(x),
             )
         else:
             import scipy.ndimage as ndimage
@@ -491,9 +493,11 @@ def _apply(self, x: Array) -> Array:
             else:
                 sigma = self._sigma
 
-            return xp.asarray(
-                ndimage.gaussian_filter(np.asarray(x), sigma=sigma, **self._kwargs),
-                device=device(x),
+            return array_api_compat.to_device(
+                xp.from_dlpack(
+                    ndimage.gaussian_filter(np.asarray(x), sigma=sigma, **self._kwargs)
+                ),
+                device(x),
             )
 
     def _adjoint(self, y: Array) -> Array:
diff --git a/src/parallelproj/pet_lors.py b/src/parallelproj/pet_lors.py
@@ -179,8 +179,8 @@ def get_lor_coordinates(
         )
 
         for i, block_pair_num in enumerate(block_pair_nums):
-            bs = self._all_block_pairs[block_pair_num, 0]
-            be = self._all_block_pairs[block_pair_num, 1]
+            bs = int(self._all_block_pairs[block_pair_num, 0])
+            be = int(self._all_block_pairs[block_pair_num, 1])
 
             eps = self.scanner.get_lor_endpoints(
                 self.xp.asarray([bs], device=self.dev),
@@ -408,14 +408,22 @@ def _setup_view_indices(self) -> None:
         )
 
         for view in np.arange(self._num_views):
-            self._start_in_ring_index[view, :] = (
-                self.xp.concat((self.xp.arange(m) // 2, self.xp.asarray([n // 2])))
-                - view
-            )[self._radial_trim : -self._radial_trim]
-            self._end_in_ring_index[view, :] = (
-                self.xp.concat((self.xp.asarray([-1]), -((self.xp.arange(m) + 4) // 2)))
-                - view
-            )[self._radial_trim : -self._radial_trim]
+            self._start_in_ring_index[view, :] = self.xp.astype(
+                (
+                    self.xp.concat((self.xp.arange(m) // 2, self.xp.asarray([n // 2])))
+                    - int(view)
+                )[self._radial_trim : -self._radial_trim],
+                self.xp.int32,
+            )
+            self._end_in_ring_index[view, :] = self.xp.astype(
+                (
+                    self.xp.concat(
+                        (self.xp.asarray([-1]), -((self.xp.arange(m) + 4) // 2))
+                    )
+                    - int(view)
+                )[self._radial_trim : -self._radial_trim],
+                self.xp.int32,
+            )
 
         # shift the negative indices
         self._start_in_ring_index = self.xp.where(
diff --git a/src/parallelproj/pet_scanners.py b/src/parallelproj/pet_scanners.py
@@ -5,6 +5,8 @@
 import abc
 from parallelproj import Array
 import matplotlib.pyplot as plt
+import numpy as np
+import array_api_compat
 
 from types import ModuleType
 from array_api_compat import size
@@ -226,27 +228,21 @@ def __init__(
         self._spacing = spacing
 
         # calculate the LOR endpoints
-        x0 = spacing[0] * (
-            xp.arange(shape[0], device=dev, dtype=xp.float32) - (shape[0] - 1) / 2
-        )
-        x1 = spacing[1] * (
-            xp.arange(shape[1], device=dev, dtype=xp.float32) - (shape[1] - 1) / 2
-        )
-        x2 = spacing[2] * (
-            xp.arange(shape[2], device=dev, dtype=xp.float32) - (shape[2] - 1) / 2
-        )
+        x0 = spacing[0] * (np.arange(shape[0], dtype=np.float32) - (shape[0] - 1) / 2)
+        x1 = spacing[1] * (np.arange(shape[1], dtype=np.float32) - (shape[1] - 1) / 2)
+        x2 = spacing[2] * (np.arange(shape[2], dtype=np.float32) - (shape[2] - 1) / 2)
 
-        X0, X1, X2 = xp.meshgrid(x0, x1, x2, indexing="ij")
+        # in the current version (1.12.0) of array_api_compat.torch the indexing kwarg is ignored,
+        # which is why we stick to numpy
+        X0, X1, X2 = np.meshgrid(x0, x1, x2, indexing="ij")
 
-        self._lor_endpoints = xp.stack(
-            (
-                xp.reshape(X0, (-1,)),
-                xp.reshape(X1, (-1,)),
-                xp.reshape(X2, (-1,)),
-            ),
+        self._lor_endpoints = np.stack(
+            (X0.ravel(), X1.ravel(), X2.ravel()),
             axis=-1,
         )
 
+        self._lor_endpoints = xp.asarray(self._lor_endpoints, device=dev)
+
         if affine_transformation_matrix is not None:
             tmp = xp.ones((self._lor_endpoints.shape[0], 4), device=dev)
             tmp[:, :-1] = self._lor_endpoints
@@ -500,7 +496,7 @@ def setup_all_lor_endpoints(self) -> None:
                     self._all_lor_endpoints_index_offset[i] + module.num_lor_endpoints
                 ),
                 :,
-            ] = module.get_lor_endpoints()
+            ] = self.xp.astype(module.get_lor_endpoints(), self.xp.float32)
 
         self._all_lor_endpoints_module_number = [
             int(self._num_lor_endpoints_per_module[i]) * [i]
diff --git a/tests/test_nontof_joseph.py b/tests/test_nontof_joseph.py
@@ -136,18 +136,18 @@ def test_adjointness(
     # generate random LORs on a sphere around the image volume
     R = 0.8 * xp.max((xp.asarray(img_dim, dtype=xp.float32, device=dev) * voxel_size))
 
-    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev)
-    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev)
-    sintheta = xp.sqrt(1 - costheta ** 2)
+    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev, dtype=xp.float32)
+    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev, dtype=xp.float32)
+    sintheta = xp.astype(xp.sqrt(1 - costheta**2), xp.float32)
 
     xstart = xp.zeros((nLORs, 3), dtype=xp.float32, device=dev)
     xstart[:, 0] = R * sintheta * xp.cos(phis)
     xstart[:, 1] = R * sintheta * xp.sin(phis)
     xstart[:, 2] = R * costheta
 
-    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev)
-    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev)
-    sintheta = xp.sqrt(1 - costheta ** 2)
+    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev, dtype=xp.float32)
+    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev, dtype=xp.float32)
+    sintheta = xp.astype(xp.sqrt(1 - costheta**2), xp.float32)
 
     xend = xp.zeros((nLORs, 3), dtype=xp.float32, device=dev)
     xend[:, 0] = R * sintheta * xp.cos(phis)
diff --git a/tests/test_toflm_joseph.py b/tests/test_toflm_joseph.py
@@ -57,7 +57,7 @@ def test_tof_lm_fwd(
                 num_tofbins = max(4 * int(n * vsize / delta / 2) + 1, 11)
 
                 trunc_factor = 1.0 / erf(ns / math.sqrt(2))
-                trunc_dist = ns * math.sqrt(sig_t ** 2 + (delta ** 2) / 12)
+                trunc_dist = ns * math.sqrt(sig_t**2 + (delta**2) / 12)
 
                 for i in range(num_tofbins // 2):
                     p_tof = parallelproj.joseph3d_fwd_tof_lm(
@@ -92,10 +92,10 @@ def test_tof_lm_fwd(
                                 sig_t,
                                 i,
                                 theory_value,
-                                float(p_tof[0] - theory_value),
+                                float(p_tof[0]) - theory_value,
                             )
 
-                        abs_diff = abs(p_tof[0] - theory_value)
+                        abs_diff = abs(float(p_tof[0]) - theory_value)
                         assert abs_diff < atol
 
                         rel_diff = abs_diff / theory_value
@@ -156,18 +156,18 @@ def test_adjointness(
     # generate random LORs on a sphere around the image volume
     R = 0.8 * xp.max((xp.asarray(img_dim, dtype=xp.float32, device=dev) * voxel_size))
 
-    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev)
-    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev)
-    sintheta = xp.sqrt(1 - costheta ** 2)
+    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev, dtype=xp.float32)
+    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev, dtype=xp.float32)
+    sintheta = xp.sqrt(1 - costheta**2)
 
     xstart = xp.zeros((nLORs, 3), dtype=xp.float32, device=dev)
     xstart[:, 0] = R * sintheta * xp.cos(phis)
     xstart[:, 1] = R * sintheta * xp.sin(phis)
     xstart[:, 2] = R * costheta
 
-    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev)
-    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev)
-    sintheta = xp.sqrt(1 - costheta ** 2)
+    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev, dtype=xp.float32)
+    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev, dtype=xp.float32)
+    sintheta = xp.sqrt(1 - costheta**2)
 
     xend = xp.zeros((nLORs, 3), dtype=xp.float32, device=dev)
     xend[:, 0] = R * sintheta * xp.cos(phis)
diff --git a/tests/test_tofsino_joseph.py b/tests/test_tofsino_joseph.py
@@ -73,7 +73,7 @@ def test_tof_sino_fwd(
                 )
 
                 trunc_factor = 1.0 / erf(ns / math.sqrt(2))
-                trunc_dist = ns * math.sqrt(sig_t ** 2 + (delta ** 2) / 12)
+                trunc_dist = ns * math.sqrt(sig_t**2 + (delta**2) / 12)
 
                 for i in range(num_tofbins // 2):
 
@@ -99,10 +99,12 @@ def test_tof_sino_fwd(
                                 sig_t,
                                 i,
                                 theory_value,
-                                float(p_tof[0, num_tofbins // 2 + i] - theory_value),
+                                float(p_tof[0, num_tofbins // 2 + i]) - theory_value,
                             )
 
-                        abs_diff = abs(p_tof[0, num_tofbins // 2 + i] - theory_value)
+                        abs_diff = abs(
+                            float(p_tof[0, num_tofbins // 2 + i]) - theory_value
+                        )
                         assert abs_diff < atol
 
                         rel_diff = abs_diff / theory_value
@@ -133,18 +135,18 @@ def test_adjointness(
     # generate random LORs on a sphere around the image volume
     R = 0.8 * xp.max((xp.asarray(img_dim, dtype=xp.float32, device=dev) * voxel_size))
 
-    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev)
-    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev)
-    sintheta = xp.sqrt(1 - costheta ** 2)
+    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev, dtype=xp.float32)
+    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev, dtype=xp.float32)
+    sintheta = xp.sqrt(1 - costheta**2)
 
     xstart = xp.zeros((nLORs, 3), dtype=xp.float32, device=dev)
     xstart[:, 0] = R * sintheta * xp.cos(phis)
     xstart[:, 1] = R * sintheta * xp.sin(phis)
     xstart[:, 2] = R * costheta
 
-    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev)
-    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev)
-    sintheta = xp.sqrt(1 - costheta ** 2)
+    phis = xp.asarray(np.random.rand(nLORs) * 2 * np.pi, device=dev, dtype=xp.float32)
+    costheta = xp.asarray(np.random.rand(nLORs) * 2 - 1, device=dev, dtype=xp.float32)
+    sintheta = xp.sqrt(1 - costheta**2)
 
     xend = xp.zeros((nLORs, 3), dtype=xp.float32, device=dev)
     xend[:, 0] = R * sintheta * xp.cos(phis)
@@ -220,7 +222,7 @@ def test_tof_sino_fwd_sum(
     n = 101
     vsize = 1.0
     voxsize = xp.asarray([vsize, vsize, vsize], dtype=xp.float32, device=dev)
-    tmp = (-0.5 * n + 0.5) * voxsize[0]
+    tmp = float((-0.5 * n + 0.5) * voxsize[0])
     img_origin = xp.asarray([tmp, tmp, tmp], dtype=xp.float32, device=dev)
     num_off = 30