Skip to content

Commit e5f3d56

Browse files
committed
Add `#pragma unroll` to the inner DFT/iDFT loops (CUDA and HIP)
1 parent 483f2ef commit e5f3d56

File tree

2 files changed

+8
-0
lines changed

2 files changed

+8
-0
lines changed

tsne/CUDA/src/kernels/nbodyfft.cu

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -300,6 +300,7 @@ void DFT2D1gpu(float* din, thrust::complex<float>* dout, int num_rows, int num_c
300300
thrust::complex<float> sum, twiddle;
301301
angle = -2.0f * PI * fdividef((float)i, (float)num_cols);
302302
sum = 0.0f;
303+
#pragma unroll
303304
for (int k = 0; k < num_cols; ++k) {
304305
// sincosf(angle * k, &sinf, &cosf);
305306
// twiddle = thrust::complex<float>(cosf, sinf);
@@ -324,6 +325,7 @@ void DFT2D2gpu(thrust::complex<float>* din, thrust::complex<float>* dout, int nu
324325
thrust::complex<float> sum, twiddle;
325326
angle = -2.0f * PI * fdividef((float)i, (float)num_cols);
326327
sum = 0.0f;
328+
#pragma unroll
327329
for (int k = 0; k < num_cols; ++k) {
328330
// sincosf(angle * k, &sinf, &cosf);
329331
// twiddle = thrust::complex<float>(cosf, sinf);
@@ -348,6 +350,7 @@ void iDFT2D1gpu(thrust::complex<float>* din, thrust::complex<float>* dout, int n
348350
thrust::complex<float> sum, twiddle;
349351
angle = 2.0f * PI * fdividef((float)i, (float)num_cols);
350352
sum = 0.0f;
353+
#pragma unroll
351354
for (int k = 0; k < num_cols; ++k) {
352355
// sincosf(angle * k, &sinf, &cosf);
353356
// twiddle = thrust::complex<float>(cosf, sinf);
@@ -376,6 +379,7 @@ void iDFT2D2gpu(thrust::complex<float>* din, float* dout, int num_rows, int num_
376379
thrust::complex<float> twiddle;
377380
angle = 2.0f * PI * fdividef((float)i, (float)num_cols);
378381
sum = 0.0f;
382+
#pragma unroll
379383
for (int k = 0; k < num_cols; ++k) {
380384
// sincosf(angle * k, &sinf, &cosf);
381385
// twiddle = thrust::complex<float>(cosf, sinf);

tsne/HIP/src/kernels/nbodyfft.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -309,6 +309,7 @@ void DFT2D1gpu(float* din, thrust::complex<float>* dout, int num_rows, int num_c
309309
thrust::complex<float> sum, twiddle;
310310
angle = -2.0f * PI * fdividef((float)i, (float)num_cols);
311311
sum = 0.0f;
312+
#pragma unroll
312313
for (int k = 0; k < num_cols; ++k) {
313314
// sincosf(angle * k, &sinf, &cosf);
314315
// twiddle = thrust::complex<float>(cosf, sinf);
@@ -333,6 +334,7 @@ void DFT2D2gpu(thrust::complex<float>* din, thrust::complex<float>* dout, int nu
333334
thrust::complex<float> sum, twiddle;
334335
angle = -2.0f * PI * fdividef((float)i, (float)num_cols);
335336
sum = 0.0f;
337+
#pragma unroll
336338
for (int k = 0; k < num_cols; ++k) {
337339
// sincosf(angle * k, &sinf, &cosf);
338340
// twiddle = thrust::complex<float>(cosf, sinf);
@@ -357,6 +359,7 @@ void iDFT2D1gpu(thrust::complex<float>* din, thrust::complex<float>* dout, int n
357359
thrust::complex<float> sum, twiddle;
358360
angle = 2.0f * PI * fdividef((float)i, (float)num_cols);
359361
sum = 0.0f;
362+
#pragma unroll
360363
for (int k = 0; k < num_cols; ++k) {
361364
// sincosf(angle * k, &sinf, &cosf);
362365
// twiddle = thrust::complex<float>(cosf, sinf);
@@ -385,6 +388,7 @@ void iDFT2D2gpu(thrust::complex<float>* din, float* dout, int num_rows, int num_
385388
thrust::complex<float> twiddle;
386389
angle = 2.0f * PI * fdividef((float)i, (float)num_cols);
387390
sum = 0.0f;
391+
#pragma unroll
388392
for (int k = 0; k < num_cols; ++k) {
389393
// sincosf(angle * k, &sinf, &cosf);
390394
// twiddle = thrust::complex<float>(cosf, sinf);

0 commit comments

Comments
 (0)