Skip to content

Commit a539792

Browse files
committed
Use unified padded length in kernel instead of array
Previously, the CUDA kernel was incorrectly passed an array of per-trajectory lengths (`P`), even though all trajectories in a batch were padded to a common length. As a result, the kernel terminated its loops early based on each trajectory's individual length; however, because the padding consists of repeated points, this did not affect the computed Fréchet distance. This commit fixes the issue by replacing the per-trajectory length array with a single integer `P` representing the unified (padded) length, adjusting the kernel interface and loop logic accordingly. This is a minor internal fix with no observable change in output.
1 parent fe776bb commit a539792

File tree

1 file changed

+3
-7
lines changed

1 file changed

+3
-7
lines changed

ffrechet-cuda/source/ffrechet-cuda.cu

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ __device__ static float metric(float dx, float dy)
1010

1111
__global__ static void kernel(const float* px,
1212
const float* py,
13-
const unsigned* P,
13+
const unsigned P,
1414
const unsigned N,
1515
const float* qx,
1616
const float* qy,
@@ -27,7 +27,7 @@ __global__ static void kernel(const float* px,
2727
buffer[j * N + idx] = acc;
2828
}
2929

30-
for (unsigned i = 1; i < P[idx]; i++)
30+
for (unsigned i = 1; i < P; i++)
3131
{
3232
for (unsigned j = Q - 1; j > 0; j--)
3333
{
@@ -101,10 +101,6 @@ void cuda_frechet_distance(const float* const* px,
101101
cuda_check(
102102
cudaMemcpy(py_d, py_flat.data(), py_flat.size() * sizeof(float), cudaMemcpyHostToDevice));
103103

104-
unsigned* P_d;
105-
cuda_check(cudaMalloc(&P_d, N * sizeof(unsigned)));
106-
cuda_check(cudaMemcpy(P_d, P, N * sizeof(unsigned), cudaMemcpyHostToDevice));
107-
108104
float* qx_d;
109105
cuda_check(cudaMalloc(&qx_d, Q * sizeof(float)));
110106
cuda_check(cudaMemcpy(qx_d, qx, Q * sizeof(float), cudaMemcpyHostToDevice));
@@ -119,7 +115,7 @@ void cuda_frechet_distance(const float* const* px,
119115
float* res_d;
120116
cuda_check(cudaMalloc(&res_d, N * sizeof(float)));
121117

122-
kernel<<<cfg.grid_size, cfg.block_size>>>(px_d, py_d, P_d, N, qx_d, qy_d, Q, buffer_d, res_d);
118+
kernel<<<cfg.grid_size, cfg.block_size>>>(px_d, py_d, P_MAX, N, qx_d, qy_d, Q, buffer_d, res_d);
123119
cuda_check();
124120
cuda_check(cudaDeviceSynchronize());
125121

0 commit comments

Comments
 (0)