add the file

A-006 · A-006 · commit 12c1c40ccec6 · 2025-06-05T16:16:21.000+08:00
diff --git a/source/module_basis/module_pw/kernels/cuda/pw_op.cu b/source/module_basis/module_pw/kernels/cuda/pw_op.cu
@@ -41,7 +41,6 @@ __global__ void set_3d_fft_box_batch(
     thrust::complex<FPTYPE>* batch_out = out + batch_idx * nxyz;
     
     const int box_idx = box_index[element_idx];  
-    printf("the batch_idx is %d, the element_idx is %d, the box_idx is %d\n", batch_idx, element_idx, box_idx);
     const thrust::complex<FPTYPE> input_val = batch_in[element_idx];
     batch_out[box_idx] = input_val;
 }
diff --git a/source/module_basis/module_pw/module_fft/fft_bundle.cpp b/source/module_basis/module_pw/module_fft/fft_bundle.cpp
@@ -45,7 +45,7 @@ void FFT_Bundle::initfft(int nx_in,
                          bool xprime_in,
                          bool mpifft_in)
 {
-    assert(this->device == "cpu" || this->device == "gpu" || this->device == "dsp");
+    assert(this->device == "cpu" || this->device == "gpu" || this->device == "dsp" || this->device == "gpu_batch");
     assert(this->precision == "single" || this->precision == "double" || this->precision == "mixing");
 
     if (this->precision == "single" || this->precision == "mixing")
@@ -101,11 +101,25 @@ void FFT_Bundle::initfft(int nx_in,
         fft_double = make_unique<FFT_ROCM<double>>();
         fft_double->initfft(nx_in, ny_in, nz_in);
 #elif defined(__CUDA)
+        std::cout<<"here is the set of the gpu"<<std::endl;
         fft_float = make_unique<FFT_CUDA<float>>();
         fft_float->initfft(nx_in, ny_in, nz_in);
+        fft_double = make_unique<FFT_CUDA<double>>();
+        fft_double->initfft(nx_in, ny_in, nz_in );
+#endif
+    }else if (device == "gpu_batch")
+    {
+#if defined(__ROCM)
+        fft_float = make_unique<FFT_ROCM<float>>();
+        fft_float->initfft(nx_in, ny_in, nz_in);
+        fft_double = make_unique<FFT_ROCM<double>>();
+        fft_double->initfft(nx_in, ny_in, nz_in);
+#elif defined(__CUDA)   
+        std::cout<<"here is the set of the batch gpu"<<std::endl;
+        fft_float = make_unique<FFT_CUDA_BATCH<float>>();
+        fft_float->initfft(nx_in, ny_in, nz_in);
         fft_double = make_unique<FFT_CUDA_BATCH<double>>();
         fft_double->initfft(nx_in, ny_in, nz_in );
-
 #endif
     }else{
         // ModuleBase::WARNING_QUIT("FFT_Bundle", "Please set the device to cpu or gpu or dsp");
@@ -238,6 +252,7 @@ template <>
 void FFT_Bundle::fft3D_forward(std::complex<double>* in,
                                std::complex<double>* out) const
 {
+    std::cout<<"FFT_Bundle::fft3D_forward<double> in FFT_bundle"<<std::endl;
     fft_double->fft3D_forward(in, out);
 }
 
diff --git a/source/module_basis/module_pw/module_fft/fft_cuda.cpp b/source/module_basis/module_pw/module_fft/fft_cuda.cpp
@@ -74,10 +74,12 @@ void FFT_CUDA<float>::fft3D_forward(std::complex<float>* in, std::complex<float>
 template <>
 void FFT_CUDA<double>::fft3D_forward(std::complex<double>* in, std::complex<double>* out) const
 {
+    std::cout<<"FFT_CUDA<double>::fft3D_forward"<<std::endl;
     CHECK_CUFFT(cufftExecZ2Z(this->z_handle,
                              reinterpret_cast<cufftDoubleComplex*>(in),
                              reinterpret_cast<cufftDoubleComplex*>(out),
                              CUFFT_FORWARD));
+    cudaCheckOnDebug();
 }
 template <>
 void FFT_CUDA<float>::fft3D_backward(std::complex<float>* in, std::complex<float>* out) const
diff --git a/source/module_basis/module_pw/module_fft/fft_cuda_batch.cpp b/source/module_basis/module_pw/module_fft/fft_cuda_batch.cpp
@@ -28,8 +28,6 @@ void FFT_CUDA_BATCH<float>::setupFFT()
 template <>
 void FFT_CUDA_BATCH<double>::setupFFT()
 {
-    std::cout<<"the nx ,ny,nz,batch is: "
-             <<this->nx<<" "<<this->ny<<" "<<this->nz<<" "<<this->batch<<std::endl;
     int rank = 3;                
     int n[3] = {this->nx, this->ny, this->nz};       
     const int size = this->nx* this->ny *this->nz; 
diff --git a/source/module_basis/module_pw/test_gpu/pw_basis_k_batch.cpp b/source/module_basis/module_pw/test_gpu/pw_basis_k_batch.cpp
@@ -3,6 +3,7 @@
 #include "module_base/module_device/device.h"
 #include "module_base/vector3.h"
 #include "module_basis/module_pw/pw_basis_k.h"
+#include "module_basis/module_pw/module_fft/fft_bundle.h"
 #include "pw_test.h"
 #include <complex>
 #include <vector>
@@ -14,45 +15,132 @@ using namespace std;
 class PW_BASIS_K_BATCH_GPU_TEST : public ::testing::Test
 {
     public:
-        const int batch = 10; // Number of batches
+        const int batch = 1;  // Number of batches
         const int npwk = 30;   // Number of planewaves
         const int nxyz = 1000; // Size of the 3D grid
         std::vector<int> box_index;  // Index mapping for 3D grid
         int* d_box_index=nullptr; // Device memory for box_index
-        std::vector<std::complex<double>> rhog;   // Input data for the test,
+        std::vector<std::complex<double>> rhog; // Host rhoG (K space) input data for the test
         std::complex<double>* d_rhog = nullptr; // Device memory for rhoG data
-        std::vector<std::complex<double>> rhor = nullptr; // Device memory for output rhoG data
-        std::complex<double>* d_rhor = nullptr; // Device memory for output data
+        std::complex<double>* d_rhog_batch = nullptr; // Device copy of rhoG used as input to the batched box op
+        std::vector<std::complex<double>> rhor; // Host buffer for rhoR (R space) data
+        std::complex<double>* d_rhor = nullptr; // Device memory for rhoR data
+        std::complex<double>* d_rhor_batch = nullptr; // Device memory for rhoR output batch data
+        ModulePW::FFT_Bundle ft_gpu;            // FFT bundle for 3D FFT operations on GPU
+        ModulePW::FFT_Bundle ft_gpu_batch;      // FFT bundle for 3D FFT operations on batch-GPU
     void SetUp() override
     {
         box_index.resize(npwk);
-        rhog.resize(npwk);
+        rhog.resize(npwk * batch);
+        rhor.resize(nxyz * batch);
+
         resize_memory_int_gpu_op()(d_box_index, npwk);
-        resize_memory_complex_gpu_op()(d_rhog, npwk);
+        resize_memory_complex_gpu_op()(d_rhog, npwk * batch);
+        resize_memory_complex_gpu_op()(d_rhor, nxyz * batch);
+        resize_memory_complex_gpu_op()(d_rhog_batch, npwk * batch);
+        resize_memory_complex_gpu_op()(d_rhor_batch, nxyz * batch);
+
         // Initialize the box_index and input with some values
         int idx = 0;
         std::generate_n(box_index.begin(), npwk, [&idx] { return idx * idx++; });
         idx =0;
-        std::generate_n(rhog.begin(), npwk, [&idx] { return std::complex<double>(sqrt(idx), 1/(idx+1)); });
+        int npwk = box_index.size();
+        // Fill rhog with complex values: after idx is incremented, each element has
+        // real part sqrt(idx) and imaginary part 1/(idx+1). idx keeps increasing
+        // across all npwk * batch elements, so every batch holds different values.
+        std::generate_n(rhog.begin(), npwk * batch, [&idx,npwk] 
+        {   
+            idx ++;
+            return std::complex<double>(std::sqrt(idx), 1.0/(idx+1));
+        });
         synchronize_memory_int_h2d_op()(d_box_index, box_index.data(), npwk);
-        synchronize_memory_complex_h2d_op()(d_rhog, rhog.data(), npwk);
+        synchronize_memory_complex_h2d_op()(d_rhog, rhog.data(), npwk * batch);
+        synchronize_memory_complex_h2d_op()(d_rhog_batch, rhog.data(), npwk * batch);
         // Initialize the box_index with some values
-        
-        // resize_memory_int_gpu_op
+        ft_gpu.setfft("gpu", "double");
+        ft_gpu.initfft(10, 10, 10 , 1, 1, 1, 1, 1, 1);
+        ft_gpu.setupFFT();
+        ft_gpu_batch.setfft("gpu", "double");
+        ft_gpu_batch.initfft(10, 10, 10 , 1, 1, 1, 1, 1, 1);
+        ft_gpu_batch.setupFFT();
     }
     void TearDown() override
     {
         box_index.clear();
         rhog.clear();
+        rhor.clear();
         delete_memory_int_gpu_op()(d_box_index);
         delete_memory_complex_gpu_op()(d_rhog);
+        delete_memory_complex_gpu_op()(d_rhor);
+        ft_gpu.clear();
+        ft_gpu_batch.clear();
     }
 };
 
 TEST_F(PW_BASIS_K_BATCH_GPU_TEST,convulution)
 {
+    // STEP 1 check that the host box_index holds the expected i * i mapping
     for (int i = 0; i < npwk; ++i)
     {
         EXPECT_EQ(box_index[i], i * i);
     }
+
+    // STEP 2 check that the batched mapping of rhog onto the 3D grid
+    // matches the mapping done one batch at a time
+    std::vector<std::complex<double>> compute_rhor(nxyz * batch);
+    std::vector<std::complex<double>> compute_rhor_batch(nxyz * batch);
+    for (int i = 0; i< batch; i++)
+    {
+        ModulePW::set_3d_fft_box_op<double, 
+            base_device::DEVICE_GPU>()
+        (
+            npwk,
+            d_box_index,
+            d_rhog + i * npwk,
+            d_rhor + i * nxyz
+        );
+        synchronize_memory_complex_d2h_op()(compute_rhor.data()+i * nxyz, d_rhor + i *nxyz, nxyz);
+    }
+    ModulePW::set_3d_fft_box_op<double, 
+        base_device::DEVICE_GPU>()
+    (
+        npwk,
+        nxyz,
+        d_box_index,
+        d_rhog_batch,
+        d_rhor_batch,
+        batch
+    );
+    
+    synchronize_memory_complex_d2h_op()(compute_rhor_batch.data(), d_rhor_batch,nxyz * batch);
+    for (int i = 0; i < nxyz*batch ; ++i)
+    {
+        EXPECT_NEAR(compute_rhor[i].real(), compute_rhor_batch[i].real(), 1e-7);
+        EXPECT_NEAR(compute_rhor[i].imag(), compute_rhor_batch[i].imag(), 1e-7);
+    }
+
+    // STEP 3 perform the 3D FFT forward operation
+    std::vector<std::complex<double>> compute_rhor1(nxyz * batch,0);
+    std::vector<std::complex<double>> compute_rhor_batch1(nxyz * batch,0);
+
+    for (int i=0;i< batch; i++)
+    {
+        ft_gpu.fft3D_forward(d_rhor, d_rhor);
+        synchronize_memory_complex_d2h_op()(compute_rhor1.data(), d_rhor , nxyz);
+    }
+    // ft_gpu.fft3D_backward(d_rhor, d_rhor);
+    for (int i=0;i< batch; i++)
+    {
+        ft_gpu_batch.fft3D_forward(d_rhor_batch, d_rhor_batch);
+        synchronize_memory_complex_d2h_op()(compute_rhor_batch1.data(), d_rhor_batch , nxyz);
+    }
+    // ft_gpu.fft3D_forward(d_rhor_batch, d_rhor_batch);
+    // synchronize_memory_complex_d2h_op()(compute_rhor_batch.data(), d_rhor_batch,nxyz * batch);
+
+    for (int i = 0; i < nxyz *batch ; ++i)
+    {
+        EXPECT_NEAR(compute_rhor1[i].real(), compute_rhor_batch1[i].real(), 1e-4);
+        // EXPECT_NEAR(compute_rhor[i].imag(), compute_rhor_batch[i].imag(), 1e-4);
+    }
+    
 }

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,6 @@ __global__ void set_3d_fft_box_batch(`
`41`	`41`	`thrust::complex<FPTYPE>* batch_out = out + batch_idx * nxyz;`
`42`	`42`
`43`	`43`	`const int box_idx = box_index[element_idx];`
`44`		`- printf("the batch_idx is %d, the element_idx is %d, the box_idx is %d\n", batch_idx, element_idx, box_idx);`
`45`	`44`	`const thrust::complex<FPTYPE> input_val = batch_in[element_idx];`
`46`	`45`	`batch_out[box_idx] = input_val;`
`47`	`46`	`}`
Original file line number	Diff line number	Diff line change
`@@ -74,10 +74,12 @@ void FFT_CUDA<float>::fft3D_forward(std::complex<float>* in, std::complex<float>`
`74`	`74`	`template <>`
`75`	`75`	`void FFT_CUDA<double>::fft3D_forward(std::complex<double>* in, std::complex<double>* out) const`
`76`	`76`	`{`
	`77`	`+ std::cout<<"FFT_CUDA<double>::fft3D_forward"<<std::endl;`
`77`	`78`	`CHECK_CUFFT(cufftExecZ2Z(this->z_handle,`
`78`	`79`	`reinterpret_cast<cufftDoubleComplex*>(in),`
`79`	`80`	`reinterpret_cast<cufftDoubleComplex*>(out),`
`80`	`81`	`CUFFT_FORWARD));`
	`82`	`+ cudaCheckOnDebug();`
`81`	`83`	`}`
`82`	`84`	`template <>`
`83`	`85`	`void FFT_CUDA<float>::fft3D_backward(std::complex<float>* in, std::complex<float>* out) const`
Original file line number	Diff line number	Diff line change
`@@ -28,8 +28,6 @@ void FFT_CUDA_BATCH<float>::setupFFT()`
`28`	`28`	`template <>`
`29`	`29`	`void FFT_CUDA_BATCH<double>::setupFFT()`
`30`	`30`	`{`
`31`		`- std::cout<<"the nx ,ny,nz,batch is: "`
`32`		`- <<this->nx<<" "<<this->ny<<" "<<this->nz<<" "<<this->batch<<std::endl;`
`33`	`31`	`int rank = 3;`
`34`	`32`	`int n[3] = {this->nx, this->ny, this->nz};`
`35`	`33`	`const int size = this->nx* this->ny *this->nz;`