Skip to content

Commit a539792

Browse files
committed
Use unified padded length in kernel instead of array
Previously, the CUDA kernel was incorrectly passed an array of per-trajectory lengths (`P`), even though all trajectories in a batch were padded to a common length. As a result, the kernel terminated its loops early based on each trajectory's individual length; however, because the padding consists of repeated points, this did not affect the computed Fréchet distance. This commit fixes the issue by replacing the per-trajectory length array with a single integer `P` representing the unified (padded) length, adjusting the kernel interface and loop logic accordingly. This is a minor internal fix with no observable change in output.
1 parent fe776bb commit a539792

File tree

1 file changed

+3
-7
lines changed

1 file changed

+3
-7
lines changed

ffrechet-cuda/source/ffrechet-cuda.cu

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ __device__ static float metric(float dx, float dy)
1010

1111
__global__ static void kernel(const float* px,
1212
const float* py,
13-
const unsigned* P,
13+
const unsigned P,
1414
const unsigned N,
1515
const float* qx,
1616
const float* qy,
@@ -27,7 +27,7 @@ __global__ static void kernel(const float* px,
2727
buffer[j * N + idx] = acc;
2828
}
2929

30-
for (unsigned i = 1; i < P[idx]; i++)
30+
for (unsigned i = 1; i < P; i++)
3131
{
3232
for (unsigned j = Q - 1; j > 0; j--)
3333
{
@@ -101,10 +101,6 @@ void cuda_frechet_distance(const float* const* px,
101101
cuda_check(
102102
cudaMemcpy(py_d, py_flat.data(), py_flat.size() * sizeof(float), cudaMemcpyHostToDevice));
103103

104-
unsigned* P_d;
105-
cuda_check(cudaMalloc(&P_d, N * sizeof(unsigned)));
106-
cuda_check(cudaMemcpy(P_d, P, N * sizeof(unsigned), cudaMemcpyHostToDevice));
107-
108104
float* qx_d;
109105
cuda_check(cudaMalloc(&qx_d, Q * sizeof(float)));
110106
cuda_check(cudaMemcpy(qx_d, qx, Q * sizeof(float), cudaMemcpyHostToDevice));
@@ -119,7 +115,7 @@ void cuda_frechet_distance(const float* const* px,
119115
float* res_d;
120116
cuda_check(cudaMalloc(&res_d, N * sizeof(float)));
121117

122-
kernel<<<cfg.grid_size, cfg.block_size>>>(px_d, py_d, P_d, N, qx_d, qy_d, Q, buffer_d, res_d);
118+
kernel<<<cfg.grid_size, cfg.block_size>>>(px_d, py_d, P_MAX, N, qx_d, qy_d, Q, buffer_d, res_d);
123119
cuda_check();
124120
cuda_check(cudaDeviceSynchronize());
125121

0 commit comments

Comments
 (0)