abacusmodeling
diff --git a/‎source/module_pw/fft.cpp‎
Lines changed: 22 additions & 10 deletions b/‎source/module_pw/fft.cpp‎
Lines changed: 22 additions & 10 deletions
diff --git a/‎source/module_pw/fft.h‎
Lines changed: 6 additions & 4 deletions b/‎source/module_pw/fft.h‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎source/module_pw/pw_basis.cpp‎
Lines changed: 4 additions & 5 deletions b/‎source/module_pw/pw_basis.cpp‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎source/module_pw/pw_basis_k.cpp‎
Lines changed: 17 additions & 1 deletion b/‎source/module_pw/pw_basis_k.cpp‎
Lines changed: 17 additions & 1 deletion
diff --git a/‎source/module_pw/pw_basis_k_big.h‎
Lines changed: 1 addition & 0 deletions b/‎source/module_pw/pw_basis_k_big.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎source/module_pw/pw_transform.cpp‎
Lines changed: 2 additions & 2 deletions b/‎source/module_pw/pw_transform.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎source/module_pw/test/CMakeLists.txt‎
Lines changed: 4 additions & 4 deletions b/‎source/module_pw/test/CMakeLists.txt‎
Lines changed: 4 additions & 4 deletions
@@ -35,6 +35,7 @@ void FFT::clear()
 	if(z_auxg!=nullptr) {fftw_free(z_auxg); z_auxg = nullptr;}
 	if(z_auxr!=nullptr) {fftw_free(z_auxr); z_auxr = nullptr;}
 	d_rspace = nullptr;
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         if (GlobalV::precision_flag == "single") {
             if (c_auxr_3d != nullptr) {
@@ -49,6 +50,7 @@ void FFT::clear()
             }
         }
     }
+#endif
     if (GlobalV::precision_flag == "single") {
         this->cleanfFFT();
         if (c_auxg != nullptr) {
@@ -95,6 +97,7 @@ void FFT:: initfft(int nx_in, int ny_in, int nz_in, int lixy_in, int rixy_in, in
 		d_rspace = (double *) z_auxg;
         // auxr_3d = static_cast<std::complex<double> *>(
         //     fftw_malloc(sizeof(fftw_complex) * (this->nx * this->ny * this->nz)));
+#if defined(__CUDA) || defined(__ROCM)
         if (GlobalV::device_flag == "gpu") {
             if (GlobalV::precision_flag == "single") {
                 resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz);
@@ -103,6 +106,7 @@ void FFT:: initfft(int nx_in, int ny_in, int nz_in, int lixy_in, int rixy_in, in
                 resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz);
             }
         }
+#endif
         if (GlobalV::precision_flag == "single") {
             c_auxg  = (std::complex<float> *) fftw_malloc(sizeof(fftwf_complex) * maxgrids);
             c_auxr  = (std::complex<float> *) fftw_malloc(sizeof(fftwf_complex) * maxgrids);
@@ -129,10 +133,10 @@ void FFT:: setupFFT()
 #if defined(__FFTW3_MPI) && defined(__MPI)
 	else
 	{
-		this->initplan_mpi();
-        if (GlobalV::precision_flag == "single") {
-		    this->initplanf_mpi();
-        }
+		// this->initplan_mpi();
+        // if (GlobalV::precision_flag == "single") {
+		//     this->initplanf_mpi();
+        // }
 	}
 #endif
 	return;
@@ -227,6 +231,7 @@ void FFT :: initplan()
     //    reinterpret_cast<fftw_complex *>(auxr_3d),
     //    FFTW_BACKWARD, FFTW_MEASURE);
 
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         if (GlobalV::precision_flag == "single") {
         #if defined(__CUDA)
@@ -243,6 +248,7 @@ void FFT :: initplan()
         #endif
         }
     }
+#endif
 
 	destroyp = false;
 }
@@ -320,15 +326,15 @@ void FFT :: initplanf()
 	destroypf = false;
 }
 
-void FFT :: initplan_mpi()
-{
+// void FFT :: initplan_mpi()
+// {
 
-}
+// }
 
-void FFT :: initplanf_mpi()
-{
+// void FFT :: initplanf_mpi()
+// {
 
-}
+// }
 
 void FFT:: cleanFFT()
 {
@@ -369,6 +375,7 @@ void FFT:: cleanFFT()
 	}
     // fftw_destroy_plan(this->plan3dforward);
     // fftw_destroy_plan(this->plan3dbackward);
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         if (GlobalV::precision_flag == "single") {
         #if defined(__CUDA)
@@ -385,6 +392,7 @@ void FFT:: cleanFFT()
         #endif
         }
     }
+#endif
 	destroyp = true;
 }
 
@@ -659,6 +667,7 @@ void FFT::fftxyc2r(std::complex<double> * in, double * out)
 	}
 }
 
+#if defined(__CUDA) || defined(__ROCM)
 template <>
 void FFT::fft3D_forward(const psi::DEVICE_GPU * /*ctx*/, std::complex<float> * in, std::complex<float> * out)
 {
@@ -728,6 +737,7 @@ void FFT::fft3D_backward(const psi::DEVICE_GPU * /*ctx*/, std::complex<double>*
     hipDeviceSynchronize();
 #endif
 }
+#endif
 
 
 template <>
@@ -757,6 +767,7 @@ std::complex<double> * FFT::get_auxg_data() {
     return this->z_auxg;
 }
 
+#if defined(__CUDA) || defined(__ROCM)
 template <>
 std::complex<float> * FFT::get_auxr_3d_data() {
     return this->c_auxr_3d;
@@ -765,5 +776,6 @@ template <>
 std::complex<double> * FFT::get_auxr_3d_data() {
     return this->z_auxr_3d;
 }
+#endif
 
 }
@@ -20,6 +20,7 @@
 #include <hip/hip_runtime.h>
 #endif
 
+// Temporary: psi itself is not needed here. However, some GPU ops are defined in psi; they should be moved into module_base or module_gpu.
 #include "module_psi/psi.h"
 // #ifdef __MIX_PRECISION
 // #include "fftw3f.h"
@@ -63,11 +64,12 @@ class FFT
 
 public:
 	//init fftw_plans
-	void initplan(); 
-	void initplan_mpi();
+	void initplan();
+	// MPI FFTW is not supported yet.
+	// void initplan_mpi();
 	//init fftwf_plans
-	void initplanf(); 
-	void initplanf_mpi();
+	void initplanf();
+	// void initplanf_mpi();
 
 public:
 	int fftnx=0, fftny=0;
 
@@ -30,9 +30,11 @@ PW_Basis:: ~PW_Basis()
     delete[] startr;
     delete[] ig2igg;
     delete[] gg_uniq;
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         delmem_int_op()(gpu_ctx, this->d_is2fftixy);
     }
+#endif
 }
 
 /// 
@@ -169,7 +171,7 @@ void PW_Basis::collect_uniqgg()
     {
         if (std::abs(tmpgg[ig] - tmpgg2[igg]) > 1.0e-8)
         {
-            tmpgg2[igg] = avg_gg / double(avg_n) ;
+            tmpgg2[igg] = avg_gg / double(avg_n);
             ++igg;
             tmpgg2[igg] = tmpgg[ig];
             avg_gg = tmpgg2[igg];
@@ -181,11 +183,8 @@ void PW_Basis::collect_uniqgg()
             avg_gg += tmpgg[ig];
         }
         this->ig2igg[sortindex[ig]] = igg;
-        if(ig == this->npw)
-        {
-            tmpgg2[igg] = avg_gg / double(avg_n) ;
-        }
     }
+    tmpgg2[igg] = avg_gg / double(avg_n);
     this->ngg = igg + 1;
     delete[] this->gg_uniq; this->gg_uniq = new double [this->ngg];
     for(int igg = 0 ; igg < this->ngg ; ++igg)
 
@@ -19,6 +19,7 @@ PW_Basis_K::~PW_Basis_K()
     delete[] igl2ig_k;
     delete[] gk2;
     delete[] ig2ixyz_k_;
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         if (GlobalV::precision_flag == "single") {
             delmem_sd_op()(gpu_ctx, this->s_kvec_c);
@@ -34,13 +35,16 @@ PW_Basis_K::~PW_Basis_K()
         delmem_int_op()(gpu_ctx, this->d_igl2isz_k);
     }
     else {
+#endif
         if (GlobalV::precision_flag == "single") {
             delmem_sh_op()(cpu_ctx, this->s_kvec_c);
             delmem_sh_op()(cpu_ctx, this->s_gcar);
             delmem_sh_op()(cpu_ctx, this->s_gk2);
         }
         // There's no need to delete double pointers while in a CPU environment.
+#if defined(__CUDA) || defined(__ROCM)
     }
+#endif
 }
 
 void PW_Basis_K:: initparameters(
@@ -86,6 +90,7 @@ void PW_Basis_K:: initparameters(
     this->fftnxy = this->fftnx * this->fftny;
     this->fftnxyz = this->fftnxy * this->fftnz;
     this->distribution_type = distribution_type_in;
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         if (GlobalV::precision_flag == "single") {
             resmem_sd_op()(gpu_ctx, this->s_kvec_c, this->nks * 3);
@@ -97,6 +102,7 @@ void PW_Basis_K:: initparameters(
         }
     }
     else {
+#endif
         if (GlobalV::precision_flag == "single") {
             resmem_sh_op()(cpu_ctx, this->s_kvec_c, this->nks * 3);
             castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
@@ -105,7 +111,9 @@ void PW_Basis_K:: initparameters(
             this->d_kvec_c = reinterpret_cast<double *>(&this->kvec_c[0][0]);
         }
         // There's no need to allocate double pointers while in a CPU environment.
+#if defined(__CUDA) || defined(__ROCM)
     }
+#endif
 }
 
 void PW_Basis_K::setupIndGk()
@@ -151,10 +159,12 @@ void PW_Basis_K::setupIndGk()
             }
         }
     }
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         resmem_int_op()(gpu_ctx, this->d_igl2isz_k, this->npwk_max * this->nks);
         syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks);
     }
+#endif
     return;
 }
 
@@ -210,6 +220,7 @@ void PW_Basis_K::collect_local_pw()
             this->gcar[ik * npwk_max + igl] = f * this->G;
         }
     }
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         if (GlobalV::precision_flag == "single") {
             resmem_sd_op()(gpu_ctx, this->s_gk2, this->npwk_max * this->nks);
@@ -225,6 +236,7 @@ void PW_Basis_K::collect_local_pw()
         }
     }
     else {
+#endif
         if (GlobalV::precision_flag == "single") {
             resmem_sh_op()(cpu_ctx, this->s_gk2, this->npwk_max * this->nks, "PW_B_K::s_gk2");
             resmem_sh_op()(cpu_ctx, this->s_gcar, this->npwk_max * this->nks * 3, "PW_B_K::s_gcar");
@@ -236,7 +248,9 @@ void PW_Basis_K::collect_local_pw()
             this->d_gk2 = this->gk2;
         }
         // There's no need to allocate double pointers while in a CPU environment.
+#if defined(__CUDA) || defined(__ROCM)
     }
+#endif
 }
 
 ModuleBase::Vector3<double> PW_Basis_K:: cal_GplusK_cartesian(const int ik, const int ig) const {
@@ -293,7 +307,7 @@ int& PW_Basis_K::getigl2ig(const int ik, const int igl) const
 
 void PW_Basis_K::get_ig2ixyz_k()
 {
-
+    delete[] this->ig2ixyz_k_;
     this->ig2ixyz_k_ = new int [this->npwk_max * this->nks];
     ModuleBase::Memory::record("PW_B_K::ig2ixyz", sizeof(int) * this->npwk_max * this->nks);
     assert(gamma_only == false); //We only finish non-gamma_only fft on GPU temperarily.
@@ -310,10 +324,12 @@ void PW_Basis_K::get_ig2ixyz_k()
             ig2ixyz_k_[igl + ik * npwk_max] = iz + iy * nz + ix * ny * nz;
         }
     }
+#if defined(__CUDA) || defined(__ROCM)
     if (GlobalV::device_flag == "gpu") {
         resmem_int_op()(gpu_ctx, ig2ixyz_k, this->npwk_max * this->nks);
         syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->ig2ixyz_k, this->ig2ixyz_k_, this->npwk_max * this->nks);
     }
+#endif
 }
 
 template <>
 
@@ -16,6 +16,7 @@ class PW_Basis_K_Big: public PW_Basis_K
     // combine [bx,by,bz] FFT grids into a big one
 	// typical values are bx=2, by=2, bz=2
 	// nbx=nx/bx, nby=ny/by, nbz=nz/bz, 
+    // Note: this class can only use initgrids(lat0_in, latvec_in, PW_Basis_Big::nx, PW_Basis_Big::ny, PW_Basis_Big::nz)!!!
     PW_Basis_K_Big(){
         bx = 1;
         by = 1;
 
@@ -91,7 +91,7 @@ void PW_Basis:: real2recip(const FPTYPE * in, std::complex<FPTYPE> * out, const
 #endif
         for(int ir = 0 ; ir < this->nrxx ; ++ir)
         {
-            this->ft.get_auxr_data<FPTYPE>()[ir] = std::complex<double>(in[ir],0);
+            this->ft.get_auxr_data<FPTYPE>()[ir] = std::complex<FPTYPE>(in[ir],0);
         }
         this->ft.fftxyfor(ft.get_auxr_data<FPTYPE>(),ft.get_auxr_data<FPTYPE>());
     }
@@ -141,7 +141,7 @@ void PW_Basis:: recip2real(const std::complex<FPTYPE> * in, std::complex<FPTYPE>
 #endif
     for(int i = 0 ; i < this->nst * this->nz ; ++i)
     {
-        ft.get_auxg_data<FPTYPE>()[i] = std::complex<double>(0, 0);
+        ft.get_auxg_data<FPTYPE>()[i] = std::complex<FPTYPE>(0, 0);
     }
 
 #ifdef _OPENMP
 
@@ -1,4 +1,4 @@
-add_definitions(-D__NORMAL)
+add_definitions(-D__NORMAL -D__MIX_PRECISION)
 AddTest(
   TARGET pw_test
   LIBS ${math_libs} planewave psi device
@@ -8,16 +8,16 @@ AddTest(
           ../../src_parallel/parallel_global.cpp ../../src_parallel/parallel_reduce.cpp
           pw_test.cpp test1-1-1.cpp test1-1-2.cpp test1-2.cpp test1-3.cpp test1-4.cpp  test1-5.cpp
           test2-1-1.cpp test2-1-2.cpp test2-2.cpp test2-3.cpp 
-          test3-1.cpp test3-2.cpp test3-3.cpp 
+          test3-1.cpp test3-2.cpp test3-3.cpp test3-3-2.cpp 
           test4-1.cpp test4-2.cpp test4-3.cpp test4-4.cpp  test4-5.cpp
           test5-1-1.cpp test5-1-2.cpp test5-2-1.cpp test5-2-2.cpp test5-3-1.cpp test5-4-1.cpp test5-4-2.cpp 
           test6-1-1.cpp test6-1-2.cpp test6-2-1.cpp test6-2-2.cpp test6-3-1.cpp test6-4-1.cpp test6-4-2.cpp 
           test7-1.cpp test6-2-1.cpp test7-3-1.cpp test7-3-2.cpp
           test8-1.cpp test8-2-1.cpp test8-3-1.cpp test8-3-2.cpp
-          test_tool.cpp
+          test_tool.cpp test-big.cpp test-other.cpp 
 )
 
 add_test(NAME pw_test_parallel
-      COMMAND mpirun -np 2 ./pw_test; mpirun -np 3./pw_test
+      COMMAND mpirun -np 3 ./pw_test; mpirun -np 4 ./pw_test
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 )
Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ void FFT::clear()`
`35`	`35`	`if(z_auxg!=nullptr) {fftw_free(z_auxg); z_auxg = nullptr;}`
`36`	`36`	`if(z_auxr!=nullptr) {fftw_free(z_auxr); z_auxr = nullptr;}`
`37`	`37`	`d_rspace = nullptr;`
	`38`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`38`	`39`	`if (GlobalV::device_flag == "gpu") {`
`39`	`40`	`if (GlobalV::precision_flag == "single") {`
`40`	`41`	`if (c_auxr_3d != nullptr) {`
`@@ -49,6 +50,7 @@ void FFT::clear()`
`49`	`50`	`}`
`50`	`51`	`}`
`51`	`52`	`}`
	`53`	`+#endif`
`52`	`54`	`if (GlobalV::precision_flag == "single") {`
`53`	`55`	`this->cleanfFFT();`
`54`	`56`	`if (c_auxg != nullptr) {`
`@@ -95,6 +97,7 @@ void FFT:: initfft(int nx_in, int ny_in, int nz_in, int lixy_in, int rixy_in, in`
`95`	`97`	`d_rspace = (double *) z_auxg;`
`96`	`98`	`// auxr_3d = static_cast<std::complex<double> *>(`
`97`	`99`	`// fftw_malloc(sizeof(fftw_complex) * (this->nx * this->ny * this->nz)));`
	`100`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`98`	`101`	`if (GlobalV::device_flag == "gpu") {`
`99`	`102`	`if (GlobalV::precision_flag == "single") {`
`100`	`103`	`resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz);`
`@@ -103,6 +106,7 @@ void FFT:: initfft(int nx_in, int ny_in, int nz_in, int lixy_in, int rixy_in, in`
`103`	`106`	`resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz);`
`104`	`107`	`}`
`105`	`108`	`}`
	`109`	`+#endif`
`106`	`110`	`if (GlobalV::precision_flag == "single") {`
`107`	`111`	`c_auxg = (std::complex<float> *) fftw_malloc(sizeof(fftwf_complex) * maxgrids);`
`108`	`112`	`c_auxr = (std::complex<float> *) fftw_malloc(sizeof(fftwf_complex) * maxgrids);`
`@@ -129,10 +133,10 @@ void FFT:: setupFFT()`
`129`	`133`	`#if defined(__FFTW3_MPI) && defined(__MPI)`
`130`	`134`	`else`
`131`	`135`	`{`
`132`		`- this->initplan_mpi();`
`133`		`- if (GlobalV::precision_flag == "single") {`
`134`		`- this->initplanf_mpi();`
`135`		`- }`
	`136`	`+ // this->initplan_mpi();`
	`137`	`+ // if (GlobalV::precision_flag == "single") {`
	`138`	`+ // this->initplanf_mpi();`
	`139`	`+ // }`
`136`	`140`	`}`
`137`	`141`	`#endif`
`138`	`142`	`return;`
`@@ -227,6 +231,7 @@ void FFT :: initplan()`
`227`	`231`	`// reinterpret_cast<fftw_complex *>(auxr_3d),`
`228`	`232`	`// FFTW_BACKWARD, FFTW_MEASURE);`
`229`	`233`
	`234`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`230`	`235`	`if (GlobalV::device_flag == "gpu") {`
`231`	`236`	`if (GlobalV::precision_flag == "single") {`
`232`	`237`	`#if defined(__CUDA)`
`@@ -243,6 +248,7 @@ void FFT :: initplan()`
`243`	`248`	`#endif`
`244`	`249`	`}`
`245`	`250`	`}`
	`251`	`+#endif`
`246`	`252`
`247`	`253`	`destroyp = false;`
`248`	`254`	`}`
`@@ -320,15 +326,15 @@ void FFT :: initplanf()`
`320`	`326`	`destroypf = false;`
`321`	`327`	`}`
`322`	`328`
`323`		`-void FFT :: initplan_mpi()`
`324`		`-{`
	`329`	`+// void FFT :: initplan_mpi()`
	`330`	`+// {`
`325`	`331`
`326`		`-}`
	`332`	`+// }`
`327`	`333`
`328`		`-void FFT :: initplanf_mpi()`
`329`		`-{`
	`334`	`+// void FFT :: initplanf_mpi()`
	`335`	`+// {`
`330`	`336`
`331`		`-}`
	`337`	`+// }`
`332`	`338`
`333`	`339`	`void FFT:: cleanFFT()`
`334`	`340`	`{`
`@@ -369,6 +375,7 @@ void FFT:: cleanFFT()`
`369`	`375`	`}`
`370`	`376`	`// fftw_destroy_plan(this->plan3dforward);`
`371`	`377`	`// fftw_destroy_plan(this->plan3dbackward);`
	`378`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`372`	`379`	`if (GlobalV::device_flag == "gpu") {`
`373`	`380`	`if (GlobalV::precision_flag == "single") {`
`374`	`381`	`#if defined(__CUDA)`
`@@ -385,6 +392,7 @@ void FFT:: cleanFFT()`
`385`	`392`	`#endif`
`386`	`393`	`}`
`387`	`394`	`}`
	`395`	`+#endif`
`388`	`396`	`destroyp = true;`
`389`	`397`	`}`
`390`	`398`
`@@ -659,6 +667,7 @@ void FFT::fftxyc2r(std::complex<double> * in, double * out)`
`659`	`667`	`}`
`660`	`668`	`}`
`661`	`669`
	`670`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`662`	`671`	`template <>`
`663`	`672`	`void FFT::fft3D_forward(const psi::DEVICE_GPU * /*ctx*/, std::complex<float> * in, std::complex<float> * out)`
`664`	`673`	`{`
`@@ -728,6 +737,7 @@ void FFT::fft3D_backward(const psi::DEVICE_GPU * /*ctx*/, std::complex<double>*`
`728`	`737`	`hipDeviceSynchronize();`
`729`	`738`	`#endif`
`730`	`739`	`}`
	`740`	`+#endif`
`731`	`741`
`732`	`742`
`733`	`743`	`template <>`
`@@ -757,6 +767,7 @@ std::complex<double> * FFT::get_auxg_data() {`
`757`	`767`	`return this->z_auxg;`
`758`	`768`	`}`
`759`	`769`
	`770`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`760`	`771`	`template <>`
`761`	`772`	`std::complex<float> * FFT::get_auxr_3d_data() {`
`762`	`773`	`return this->c_auxr_3d;`
`@@ -765,5 +776,6 @@ template <>`
`765`	`776`	`std::complex<double> * FFT::get_auxr_3d_data() {`
`766`	`777`	`return this->z_auxr_3d;`
`767`	`778`	`}`
	`779`	`+#endif`
`768`	`780`
`769`	`781`	`}`
Original file line number	Diff line number	Diff line change
`@@ -30,9 +30,11 @@ PW_Basis:: ~PW_Basis()`
`30`	`30`	`delete[] startr;`
`31`	`31`	`delete[] ig2igg;`
`32`	`32`	`delete[] gg_uniq;`
	`33`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`33`	`34`	`if (GlobalV::device_flag == "gpu") {`
`34`	`35`	`delmem_int_op()(gpu_ctx, this->d_is2fftixy);`
`35`	`36`	`}`
	`37`	`+#endif`
`36`	`38`	`}`
`37`	`39`
`38`	`40`	`///`
`@@ -169,7 +171,7 @@ void PW_Basis::collect_uniqgg()`
`169`	`171`	`{`
`170`	`172`	`if (std::abs(tmpgg[ig] - tmpgg2[igg]) > 1.0e-8)`
`171`	`173`	`{`
`172`		`- tmpgg2[igg] = avg_gg / double(avg_n) ;`
	`174`	`+ tmpgg2[igg] = avg_gg / double(avg_n);`
`173`	`175`	`++igg;`
`174`	`176`	`tmpgg2[igg] = tmpgg[ig];`
`175`	`177`	`avg_gg = tmpgg2[igg];`
`@@ -181,11 +183,8 @@ void PW_Basis::collect_uniqgg()`
`181`	`183`	`avg_gg += tmpgg[ig];`
`182`	`184`	`}`
`183`	`185`	`this->ig2igg[sortindex[ig]] = igg;`
`184`		`- if(ig == this->npw)`
`185`		`- {`
`186`		`- tmpgg2[igg] = avg_gg / double(avg_n) ;`
`187`		`- }`
`188`	`186`	`}`
	`187`	`+ tmpgg2[igg] = avg_gg / double(avg_n);`
`189`	`188`	`this->ngg = igg + 1;`
`190`	`189`	`delete[] this->gg_uniq; this->gg_uniq = new double [this->ngg];`
`191`	`190`	`for(int igg = 0 ; igg < this->ngg ; ++igg)`
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ PW_Basis_K::~PW_Basis_K()`
`19`	`19`	`delete[] igl2ig_k;`
`20`	`20`	`delete[] gk2;`
`21`	`21`	`delete[] ig2ixyz_k_;`
	`22`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`22`	`23`	`if (GlobalV::device_flag == "gpu") {`
`23`	`24`	`if (GlobalV::precision_flag == "single") {`
`24`	`25`	`delmem_sd_op()(gpu_ctx, this->s_kvec_c);`
`@@ -34,13 +35,16 @@ PW_Basis_K::~PW_Basis_K()`
`34`	`35`	`delmem_int_op()(gpu_ctx, this->d_igl2isz_k);`
`35`	`36`	`}`
`36`	`37`	`else {`
	`38`	`+#endif`
`37`	`39`	`if (GlobalV::precision_flag == "single") {`
`38`	`40`	`delmem_sh_op()(cpu_ctx, this->s_kvec_c);`
`39`	`41`	`delmem_sh_op()(cpu_ctx, this->s_gcar);`
`40`	`42`	`delmem_sh_op()(cpu_ctx, this->s_gk2);`
`41`	`43`	`}`
`42`	`44`	`// There's no need to delete double pointers while in a CPU environment.`
	`45`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`43`	`46`	`}`
	`47`	`+#endif`
`44`	`48`	`}`
`45`	`49`
`46`	`50`	`void PW_Basis_K:: initparameters(`
`@@ -86,6 +90,7 @@ void PW_Basis_K:: initparameters(`
`86`	`90`	`this->fftnxy = this->fftnx * this->fftny;`
`87`	`91`	`this->fftnxyz = this->fftnxy * this->fftnz;`
`88`	`92`	`this->distribution_type = distribution_type_in;`
	`93`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`89`	`94`	`if (GlobalV::device_flag == "gpu") {`
`90`	`95`	`if (GlobalV::precision_flag == "single") {`
`91`	`96`	`resmem_sd_op()(gpu_ctx, this->s_kvec_c, this->nks * 3);`
`@@ -97,6 +102,7 @@ void PW_Basis_K:: initparameters(`
`97`	`102`	`}`
`98`	`103`	`}`
`99`	`104`	`else {`
	`105`	`+#endif`
`100`	`106`	`if (GlobalV::precision_flag == "single") {`
`101`	`107`	`resmem_sh_op()(cpu_ctx, this->s_kvec_c, this->nks * 3);`
`102`	`108`	`castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);`
`@@ -105,7 +111,9 @@ void PW_Basis_K:: initparameters(`
`105`	`111`	`this->d_kvec_c = reinterpret_cast<double *>(&this->kvec_c[0][0]);`
`106`	`112`	`}`
`107`	`113`	`// There's no need to allocate double pointers while in a CPU environment.`
	`114`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`108`	`115`	`}`
	`116`	`+#endif`
`109`	`117`	`}`
`110`	`118`
`111`	`119`	`void PW_Basis_K::setupIndGk()`
`@@ -151,10 +159,12 @@ void PW_Basis_K::setupIndGk()`
`151`	`159`	`}`
`152`	`160`	`}`
`153`	`161`	`}`
	`162`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`154`	`163`	`if (GlobalV::device_flag == "gpu") {`
`155`	`164`	`resmem_int_op()(gpu_ctx, this->d_igl2isz_k, this->npwk_max * this->nks);`
`156`	`165`	`syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks);`
`157`	`166`	`}`
	`167`	`+#endif`
`158`	`168`	`return;`
`159`	`169`	`}`
`160`	`170`
`@@ -210,6 +220,7 @@ void PW_Basis_K::collect_local_pw()`
`210`	`220`	`this->gcar[ik * npwk_max + igl] = f * this->G;`
`211`	`221`	`}`
`212`	`222`	`}`
	`223`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`213`	`224`	`if (GlobalV::device_flag == "gpu") {`
`214`	`225`	`if (GlobalV::precision_flag == "single") {`
`215`	`226`	`resmem_sd_op()(gpu_ctx, this->s_gk2, this->npwk_max * this->nks);`
`@@ -225,6 +236,7 @@ void PW_Basis_K::collect_local_pw()`
`225`	`236`	`}`
`226`	`237`	`}`
`227`	`238`	`else {`
	`239`	`+#endif`
`228`	`240`	`if (GlobalV::precision_flag == "single") {`
`229`	`241`	`resmem_sh_op()(cpu_ctx, this->s_gk2, this->npwk_max * this->nks, "PW_B_K::s_gk2");`
`230`	`242`	`resmem_sh_op()(cpu_ctx, this->s_gcar, this->npwk_max * this->nks * 3, "PW_B_K::s_gcar");`
`@@ -236,7 +248,9 @@ void PW_Basis_K::collect_local_pw()`
`236`	`248`	`this->d_gk2 = this->gk2;`
`237`	`249`	`}`
`238`	`250`	`// There's no need to allocate double pointers while in a CPU environment.`
	`251`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`239`	`252`	`}`
	`253`	`+#endif`
`240`	`254`	`}`
`241`	`255`
`242`	`256`	`ModuleBase::Vector3<double> PW_Basis_K:: cal_GplusK_cartesian(const int ik, const int ig) const {`
`@@ -293,7 +307,7 @@ int& PW_Basis_K::getigl2ig(const int ik, const int igl) const`
`293`	`307`
`294`	`308`	`void PW_Basis_K::get_ig2ixyz_k()`
`295`	`309`	`{`
`296`		`-`
	`310`	`+ delete[] this->ig2ixyz_k_;`
`297`	`311`	`this->ig2ixyz_k_ = new int [this->npwk_max * this->nks];`
`298`	`312`	`ModuleBase::Memory::record("PW_B_K::ig2ixyz", sizeof(int) * this->npwk_max * this->nks);`
`299`	`313`	`assert(gamma_only == false); //We only finish non-gamma_only fft on GPU temperarily.`
`@@ -310,10 +324,12 @@ void PW_Basis_K::get_ig2ixyz_k()`
`310`	`324`	`ig2ixyz_k_[igl + ik * npwk_max] = iz + iy * nz + ix * ny * nz;`
`311`	`325`	`}`
`312`	`326`	`}`
	`327`	`+#if defined(__CUDA) \|\| defined(__ROCM)`
`313`	`328`	`if (GlobalV::device_flag == "gpu") {`
`314`	`329`	`resmem_int_op()(gpu_ctx, ig2ixyz_k, this->npwk_max * this->nks);`
`315`	`330`	`syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->ig2ixyz_k, this->ig2ixyz_k_, this->npwk_max * this->nks);`
`316`	`331`	`}`
	`332`	`+#endif`
`317`	`333`	`}`
`318`	`334`
`319`	`335`	`template <>`
Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ void PW_Basis:: real2recip(const FPTYPE * in, std::complex<FPTYPE> * out, const`
`91`	`91`	`#endif`
`92`	`92`	`for(int ir = 0 ; ir < this->nrxx ; ++ir)`
`93`	`93`	`{`
`94`		`- this->ft.get_auxr_data<FPTYPE>()[ir] = std::complex<double>(in[ir],0);`
	`94`	`+ this->ft.get_auxr_data<FPTYPE>()[ir] = std::complex<FPTYPE>(in[ir],0);`
`95`	`95`	`}`
`96`	`96`	`this->ft.fftxyfor(ft.get_auxr_data<FPTYPE>(),ft.get_auxr_data<FPTYPE>());`
`97`	`97`	`}`
`@@ -141,7 +141,7 @@ void PW_Basis:: recip2real(const std::complex<FPTYPE> * in, std::complex<FPTYPE>`
`141`	`141`	`#endif`
`142`	`142`	`for(int i = 0 ; i < this->nst * this->nz ; ++i)`
`143`	`143`	`{`
`144`		`- ft.get_auxg_data<FPTYPE>()[i] = std::complex<double>(0, 0);`
	`144`	`+ ft.get_auxg_data<FPTYPE>()[i] = std::complex<FPTYPE>(0, 0);`
`145`	`145`	`}`
`146`	`146`
`147`	`147`	`#ifdef _OPENMP`