Skip to content

Commit f843736

Browse files
committed
use cufft
1 parent 13eff2e commit f843736

File tree

7 files changed

+76
-55
lines changed

7 files changed

+76
-55
lines changed

tsne/CUDA/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ set(SOURCES
7878
${CMAKE_SOURCE_DIR}/src/utils/math_utils.cu
7979
${CMAKE_SOURCE_DIR}/src/utils/matrix_broadcast_utils.cu
8080
${CMAKE_SOURCE_DIR}/src/utils/reduce_utils.cu
81+
${CMAKE_SOURCE_DIR}/../data/verify.cpp
8182

8283
# Kernels
8384
${CMAKE_SOURCE_DIR}/src/kernels/apply_forces.cu
@@ -95,6 +96,7 @@ set(SOURCES
9596
include_directories(
9697
${CMAKE_SOURCE_DIR}/src
9798
${CMAKE_SOURCE_DIR}/src/include
99+
${CMAKE_SOURCE_DIR}/../data
98100
${CUDA_INCLUDE_DIRS}
99101
)
100102

tsne/CUDA/src/exe/main.cu

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ int main(int argc, char** argv)
6363
std::chrono::steady_clock::time_point time_end;
6464
double time_total = 0.0;
6565
double time_total_ = 0.0;
66+
int success = 99;
6667

6768
TIMER_START()
6869

@@ -125,7 +126,12 @@ int main(int argc, char** argv)
125126
}
126127

127128
// Do the t-SNE
128-
time_total_ = tsnecuda::RunTsne(opt);
129+
time_total_ = tsnecuda::RunTsne(opt, success);
130+
if (success == 0) {
131+
std::cout << "Verification SUCCESSFUL\n";
132+
} else {
133+
std::cout << "Verification FAILED\n";
134+
}
129135
std::cout << "\nDone!\n";
130136
} catch (std::exception const& e) {
131137
std::cout << "Exception: " << e.what() << "\n";
@@ -134,5 +140,5 @@ int main(int argc, char** argv)
134140
TIMER_END()
135141
TIMER_PRINT("tsne - total time for whole calculation")
136142

137-
return 0;
143+
return success;
138144
}

tsne/CUDA/src/fit_tsne.cu

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
#include <chrono>
3636
#include "include/fit_tsne.h"
37+
#include "verify.hpp"
3738

3839
// #ifndef DEBUG_TIME
3940
// #define DEBUG_TIME
@@ -62,7 +63,7 @@
6263
#define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float)x.count()) / 1000000.0 << "s" << std::endl
6364
#endif
6465

65-
double tsnecuda::RunTsne(tsnecuda::Options& opt)
66+
double tsnecuda::RunTsne(tsnecuda::Options& opt, int& success)
6667
{
6768
std::chrono::steady_clock::time_point time_start_;
6869
std::chrono::steady_clock::time_point time_end_;
@@ -406,8 +407,9 @@ double tsnecuda::RunTsne(tsnecuda::Options& opt)
406407
std::cout << "done." << std::endl;
407408
}
408409

409-
// int fft_dimensions[2] = {n_fft_coeffs, n_fft_coeffs}; // {780, 780}
410-
// size_t work_size, work_size_dft, work_size_idft;
410+
int fft_dimensions[2] = {n_fft_coeffs, n_fft_coeffs}; // {780, 780}
411+
size_t work_size_idft, work_size_dft;
412+
// size_t work_size;
411413

412414
// std::cout << "Setting up dft plans...\n";
413415
// // *** TIMED SEPARATELY. NOT ADDED TO PERF TIME ***
@@ -424,41 +426,41 @@ double tsnecuda::RunTsne(tsnecuda::Options& opt)
424426
// TIME_SINCE(time_start);
425427

426428
// TIME_START();
427-
// cufftHandle plan_dft;
428-
// CufftSafeCall(cufftCreate(&plan_dft));
429-
// CufftSafeCall(cufftMakePlanMany(
430-
// plan_dft,
431-
// 2,
432-
// fft_dimensions,
433-
// NULL,
434-
// 1,
435-
// n_fft_coeffs * n_fft_coeffs,
436-
// NULL,
437-
// 1,
438-
// n_fft_coeffs * (n_fft_coeffs / 2 + 1),
439-
// CUFFT_R2C,
440-
// n_terms,
441-
// &work_size_dft)
442-
// );
429+
cufftHandle plan_dft;
430+
CufftSafeCall(cufftCreate(&plan_dft));
431+
CufftSafeCall(cufftMakePlanMany(
432+
plan_dft,
433+
2,
434+
fft_dimensions,
435+
NULL,
436+
1,
437+
n_fft_coeffs * n_fft_coeffs,
438+
NULL,
439+
1,
440+
n_fft_coeffs * (n_fft_coeffs / 2 + 1),
441+
CUFFT_R2C,
442+
n_terms,
443+
&work_size_dft)
444+
);
443445
// TIME_SINCE(time_start);
444446

445447
// TIME_START();
446-
// cufftHandle plan_idft;
447-
// CufftSafeCall(cufftCreate(&plan_idft));
448-
// CufftSafeCall(cufftMakePlanMany(
449-
// plan_idft,
450-
// 2,
451-
// fft_dimensions,
452-
// NULL,
453-
// 1,
454-
// n_fft_coeffs * (n_fft_coeffs / 2 + 1),
455-
// NULL,
456-
// 1,
457-
// n_fft_coeffs * n_fft_coeffs,
458-
// CUFFT_C2R,
459-
// n_terms,
460-
// &work_size_idft)
461-
// );
448+
cufftHandle plan_idft;
449+
CufftSafeCall(cufftCreate(&plan_idft));
450+
CufftSafeCall(cufftMakePlanMany(
451+
plan_idft,
452+
2,
453+
fft_dimensions,
454+
NULL,
455+
1,
456+
n_fft_coeffs * (n_fft_coeffs / 2 + 1),
457+
NULL,
458+
1,
459+
n_fft_coeffs * n_fft_coeffs,
460+
CUFFT_C2R,
461+
n_terms,
462+
&work_size_idft)
463+
);
462464
// TIME_SINCE(time_start);
463465
// std::cout << "done.\n";
464466

@@ -545,8 +547,8 @@ double tsnecuda::RunTsne(tsnecuda::Options& opt)
545547
#endif
546548

547549
tsnecuda::NbodyFFT2D(
548-
// plan_dft,
549-
// plan_idft,
550+
plan_dft,
551+
plan_idft,
550552
fft_kernel_tilde_device, // input
551553
fft_w_coefficients, // intermediate value
552554
N,
@@ -697,6 +699,9 @@ double tsnecuda::RunTsne(tsnecuda::Options& opt)
697699
dump_file << host_ys[i] << " " << host_ys[i + num_points] << std::endl;
698700
}
699701
dump_file.close();
702+
703+
std::string golden_file = "../../data/tsne_mnist_output_golden.txt";
704+
success = verify(golden_file, opt.get_dump_file(), 0.2, 10.0);
700705
TIMER_END_()
701706

702707
host_ys.clear();

tsne/CUDA/src/include/fit_tsne.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
#include "include/kernels/rep_forces.h"
5757

5858
namespace tsnecuda {
59-
double RunTsne(tsnecuda::Options& opt);
59+
double RunTsne(tsnecuda::Options& opt, int& success);
6060
}
6161

6262
#endif

tsne/CUDA/src/include/kernels/nbodyfft.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ void PrecomputeFFT2D(
5959
thrust::device_vector<thrust::complex<float>>& fft_scratchpad_device, double& duration); // added
6060

6161
void NbodyFFT2D(
62-
// cufftHandle& plan_dft,
63-
// cufftHandle& plan_idft,
62+
cufftHandle& plan_dft,
63+
cufftHandle& plan_idft,
6464
thrust::device_vector<thrust::complex<float>>& fft_kernel_tilde_device,
6565
thrust::device_vector<thrust::complex<float>>& fft_w_coefficients,
6666
int N,

tsne/CUDA/src/include/utils/thrust_transform_functions.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ struct FunctionalEntropy {
4747
__host__ __device__
4848
float operator()(const float& x) const {
4949
float val = x * log(x);
50-
return (val != val || isinf(val)) ? 0 : val;
50+
return (x == 0 || val != val || isinf(val)) ? 0 : val;
5151
}
5252
};
5353

tsne/CUDA/src/kernels/nbodyfft.cu

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,8 @@ void tsnecuda::PrecomputeFFT2D(
477477
}
478478

479479
void tsnecuda::NbodyFFT2D(
480-
// cufftHandle& plan_dft,
481-
// cufftHandle& plan_idft,
480+
cufftHandle& plan_dft,
481+
cufftHandle& plan_idft,
482482
thrust::device_vector<thrust::complex<float>>& fft_kernel_tilde_device,
483483
thrust::device_vector<thrust::complex<float>>& fft_w_coefficients,
484484
int N,
@@ -592,13 +592,15 @@ void tsnecuda::NbodyFFT2D(
592592
);
593593
CUDA_CHECK_LAST_ERROR()
594594
GpuErrorCheck(cudaDeviceSynchronize());
595+
#define USE_CUFFT
595596

596-
// // Compute fft values at interpolated nodes
597-
// cufftExecR2C(plan_dft,
598-
// reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_input.data())),
599-
// reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())));
600-
// GpuErrorCheck(cudaDeviceSynchronize());
601-
597+
#ifdef USE_CUFFT
598+
// Compute fft values at interpolated nodes
599+
cufftExecR2C(plan_dft,
600+
reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_input.data())),
601+
reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())));
602+
GpuErrorCheck(cudaDeviceSynchronize());
603+
#else
602604
int num_rows = n_fft_coeffs;
603605
int num_cols = n_fft_coeffs;
604606

@@ -629,6 +631,7 @@ void tsnecuda::NbodyFFT2D(
629631
CUDA_CHECK_LAST_ERROR();
630632
GpuErrorCheck(cudaDeviceSynchronize());
631633
}
634+
#endif
632635

633636
// Take the broadcasted Hadamard product of a complex matrix and a complex vector
634637
// TODO: Check timing on this kernel
@@ -642,11 +645,13 @@ void tsnecuda::NbodyFFT2D(
642645
thrust::complex<float>(1.0f));
643646

644647
// Invert the computed values at the interpolated nodes
645-
// cufftExecC2R(plan_idft,
646-
// reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())),
647-
// reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_output.data())));
648-
// GpuErrorCheck(cudaDeviceSynchronize());
649648

649+
#ifdef USE_CUFFT
650+
cufftExecC2R(plan_idft,
651+
reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())),
652+
reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_output.data())));
653+
GpuErrorCheck(cudaDeviceSynchronize());
654+
#else
650655
din = reinterpret_cast<float*>(thrust::raw_pointer_cast(fft_output.data()));
651656

652657
for (int f = 0; f < n_terms; ++f) {
@@ -668,6 +673,9 @@ void tsnecuda::NbodyFFT2D(
668673
CUDA_CHECK_LAST_ERROR();
669674
GpuErrorCheck(cudaDeviceSynchronize());
670675
}
676+
#endif
677+
678+
#undef USE_CUFFT
671679

672680
copy_from_fft_output<<<num_blocks, num_threads>>>(
673681
thrust::raw_pointer_cast(y_tilde_values.data()), // output

0 commit comments

Comments
 (0)