deepmodeling
diff --git a/‎docs/advanced/input_files/input-main.md‎
Lines changed: 4 additions & 7 deletions b/‎docs/advanced/input_files/input-main.md‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎source/module_base/module_device/memory_op.h‎
Lines changed: 1 addition & 0 deletions b/‎source/module_base/module_device/memory_op.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎source/module_base/tool_quit.cpp‎
Lines changed: 14 additions & 0 deletions b/‎source/module_base/tool_quit.cpp‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎source/module_basis/module_pw/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎source/module_basis/module_pw/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎source/module_basis/module_pw/module_fft/fft_cuda.cpp‎
Lines changed: 26 additions & 26 deletions b/‎source/module_basis/module_pw/module_fft/fft_cuda.cpp‎
Lines changed: 26 additions & 26 deletions
diff --git a/‎source/module_basis/module_pw/pw_basis.cpp‎
Lines changed: 2 additions & 0 deletions b/‎source/module_basis/module_pw/pw_basis.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎source/module_basis/module_pw/pw_basis.h‎
Lines changed: 13 additions & 8 deletions b/‎source/module_basis/module_pw/pw_basis.h‎
Lines changed: 13 additions & 8 deletions
diff --git a/‎source/module_basis/module_pw/pw_basis_big.h‎
Lines changed: 0 additions & 1 deletion b/‎source/module_basis/module_pw/pw_basis_big.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎source/module_basis/module_pw/pw_basis_k_big.h‎
Lines changed: 1 addition & 2 deletions b/‎source/module_basis/module_pw/pw_basis_k_big.h‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎source/module_basis/module_pw/pw_basis_sup.cpp‎
Lines changed: 13 additions & 13 deletions b/‎source/module_basis/module_pw/pw_basis_sup.cpp‎
Lines changed: 13 additions & 13 deletions
@@ -52,7 +52,6 @@
     - [lcao\_dr](#lcao_dr)
     - [lcao\_rmax](#lcao_rmax)
     - [search\_radius](#search_radius)
-    - [search\_pbc](#search_pbc)
     - [bx, by, bz](#bx-by-bz)
     - [elpa\_num\_thread](#elpa_num_thread)
     - [num\_stream](#num_stream)
@@ -924,12 +923,6 @@ These variables are used to control the numerical atomic orbitals related parame
 - **Default**: -1
 - **Unit**: Bohr
 
-### search_pbc
-
-- **Type**: Boolean
-- **Description**: If True, periodic images will be included in searching for the neighbouring atoms. If False, periodic images will be ignored.
-- **Default**: True
-
 ### bx, by, bz
 
 - **Type**: Integer
@@ -3586,19 +3579,23 @@ These variables are used to control berry phase and wannier90 interface paramete
 - **Type**: Real
 - **Description**:
   `td_lcut1` is the lower bound of the interval in the length gauge RT-TDDFT, where $x$ is the fractional coordinate:
+
   $$
     E(x)=\begin{cases}E_0, & \mathtt{cut1}\leqslant x \leqslant \mathtt{cut2} \\-E_0\left(\dfrac{1}{\mathtt{cut1}+1-\mathtt{cut2}}-1\right), & 0 < x < \mathtt{cut1~~or~~cut2} < x < 1 \end{cases}
   $$
+
 - **Default**: 0.05
 
 ### td_lcut2
 
 - **Type**: Real
 - **Description**:
   `td_lcut2` is the upper bound of the interval in the length gauge RT-TDDFT, where $x$ is the fractional coordinate:
+
   $$
     E(x)=\begin{cases}E_0, & \mathtt{cut1}\leqslant x \leqslant \mathtt{cut2} \\-E_0\left(\dfrac{1}{\mathtt{cut1}+1-\mathtt{cut2}}-1\right), & 0 < x < \mathtt{cut1~~or~~cut2} < x < 1 \end{cases}
   $$
+
 - **Default**: 0.95
 
 ### td_gauss_freq
 
@@ -133,6 +133,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVIC
     void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size);
+    
 };
 
 template <typename FPTYPE>
 
@@ -62,6 +62,20 @@ void QUIT(int ret)
 void WARNING_QUIT(const std::string &file,const std::string &description)
 {
 	WARNING_QUIT(file, description, 1);
+
+	#ifdef __MPI /* if it is MPI run, finalize first, then exit */
+	std::cout << "Detecting if MPI has been initialized..." << std::endl; // NOTE(review): this block runs only after the three-argument WARNING_QUIT above returns; if that overload exits the process, nothing from here on executes — confirm
+	int is_initialized; // flag filled in by MPI_Initialized on the next line
+    MPI_Initialized(&is_initialized);
+	if (is_initialized) {
+		std::cout << "Terminating ABACUS with multiprocessing environment." << std::endl;
+		MPI_Finalize(); // called only when MPI reports it has been initialized
+	}
+	else{
+		std::cout << "MPI has not been initialized. Quit normally." << std::endl;
+	}
+	/* NOTE: this appears to be the only correct way to terminate MPI */
+#endif
 }
 
 void WARNING_QUIT(const std::string &file,const std::string &description,int ret)
 
@@ -58,5 +58,6 @@ if(BUILD_TESTING)
     add_subdirectory(test)
     add_subdirectory(test_serial)
     add_subdirectory(kernels/test)
+    add_subdirectory(test_gpu)
   endif()
 endif()
@@ -1,13 +1,12 @@
 #include "fft_cuda.h"
+
 #include "module_base/module_device/memory_op.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 
 namespace ModulePW
 {
 template <typename FPTYPE>
-void FFT_CUDA<FPTYPE>::initfft(int nx_in, 
-                               int ny_in, 
-                               int nz_in)
+void FFT_CUDA<FPTYPE>::initfft(int nx_in, int ny_in, int nz_in)
 {
     this->nx = nx_in;
     this->ny = ny_in;
@@ -18,9 +17,8 @@ void FFT_CUDA<float>::setupFFT()
 {
     cufftPlan3d(&c_handle, this->nx, this->ny, this->nz, CUFFT_C2C);
     resmem_cd_op()(this->c_auxr_3d, this->nx * this->ny * this->nz);
-        
 }
-template <>  
+template <>
 void FFT_CUDA<double>::setupFFT()
 {
     cufftPlan3d(&z_handle, this->nx, this->ny, this->nz, CUFFT_Z2Z);
@@ -66,49 +64,51 @@ void FFT_CUDA<double>::clear()
 }
 
 template <>
-void FFT_CUDA<float>::fft3D_forward(std::complex<float>* in, 
-                                    std::complex<float>* out) const
+void FFT_CUDA<float>::fft3D_forward(std::complex<float>* in, std::complex<float>* out) const
 {
-    CHECK_CUFFT(cufftExecC2C(this->c_handle, 
-                             reinterpret_cast<cufftComplex*>(in), 
+    CHECK_CUFFT(cufftExecC2C(this->c_handle,
+                             reinterpret_cast<cufftComplex*>(in),
                              reinterpret_cast<cufftComplex*>(out),
                              CUFFT_FORWARD));
 }
 template <>
-void FFT_CUDA<double>::fft3D_forward(std::complex<double>* in, 
-                                     std::complex<double>* out) const
+void FFT_CUDA<double>::fft3D_forward(std::complex<double>* in, std::complex<double>* out) const
 {
-    CHECK_CUFFT(cufftExecZ2Z(this->z_handle, 
+    CHECK_CUFFT(cufftExecZ2Z(this->z_handle,
                              reinterpret_cast<cufftDoubleComplex*>(in),
-                             reinterpret_cast<cufftDoubleComplex*>(out), 
+                             reinterpret_cast<cufftDoubleComplex*>(out),
                              CUFFT_FORWARD));
 }
 template <>
-void FFT_CUDA<float>::fft3D_backward(std::complex<float>* in, 
-                                     std::complex<float>* out) const
+void FFT_CUDA<float>::fft3D_backward(std::complex<float>* in, std::complex<float>* out) const
 {
-    CHECK_CUFFT(cufftExecC2C(this->c_handle, 
-                             reinterpret_cast<cufftComplex*>(in), 
+    CHECK_CUFFT(cufftExecC2C(this->c_handle,
+                             reinterpret_cast<cufftComplex*>(in),
                              reinterpret_cast<cufftComplex*>(out),
                              CUFFT_INVERSE));
 }
 
 template <>
-void FFT_CUDA<double>::fft3D_backward(std::complex<double>* in, 
-                                      std::complex<double>* out) const
+void FFT_CUDA<double>::fft3D_backward(std::complex<double>* in, std::complex<double>* out) const
 {
-    CHECK_CUFFT(cufftExecZ2Z(this->z_handle, 
+    CHECK_CUFFT(cufftExecZ2Z(this->z_handle,
                              reinterpret_cast<cufftDoubleComplex*>(in),
-                             reinterpret_cast<cufftDoubleComplex*>(out), 
+                             reinterpret_cast<cufftDoubleComplex*>(out),
                              CUFFT_INVERSE));
 }
-template <> std::complex<float>* 
-FFT_CUDA<float>::get_auxr_3d_data()  const {return this->c_auxr_3d;}
-template <> std::complex<double>* 
-FFT_CUDA<double>::get_auxr_3d_data() const {return this->z_auxr_3d;}
+template <>
+std::complex<float>* FFT_CUDA<float>::get_auxr_3d_data() const // single-precision specialization
+{
+    return this->c_auxr_3d; // hands back the pointer held in the c_auxr_3d member
+}
+template <>
+std::complex<double>* FFT_CUDA<double>::get_auxr_3d_data() const // double-precision specialization
+{
+    return this->z_auxr_3d; // hands back the pointer held in the z_auxr_3d member
+}
 
 template FFT_CUDA<float>::FFT_CUDA();
 template FFT_CUDA<float>::~FFT_CUDA();
 template FFT_CUDA<double>::FFT_CUDA();
 template FFT_CUDA<double>::~FFT_CUDA();
-}// namespace ModulePW
+} // namespace ModulePW
@@ -43,6 +43,7 @@ PW_Basis:: ~PW_Basis()
     if (this->device == "gpu")
     {
         delmem_int_op()(this->d_is2fftixy);
+        delmem_int_op()(this->ig2ixyz_gpu);
     }
 #endif
 }
@@ -59,6 +60,7 @@ void PW_Basis::setuptransform()
     this->distribute_g();
     this->getstartgr();
     this->fft_bundle.clear();
+    
     if(this->xprime)    
     {
         this->fft_bundle.initfft(this->nx,this->ny,this->nz,this->lix,this->rix,this->nst,this->nplane,this->poolnproc,this->gamma_only, this->xprime);
 
@@ -100,10 +100,11 @@ class PW_Basis
 //===============================================
 public:
 #ifdef __MPI
-    MPI_Comm pool_world;
+    MPI_Comm pool_world=MPI_COMM_NULL;
 #endif
 
     int *ig2isz=nullptr; // map ig to (is, iz).
+    int *ig2ixyz_gpu = nullptr;
     int *istot2ixy=nullptr; // istot2ixy[is]: iy + ix * ny of is^th stick among all sticks.
     int *is2fftixy=nullptr, * d_is2fftixy = nullptr; // is2fftixy[is]: iy + ix * ny of is^th stick among sticks on current proc.
     int *fftixy2ip=nullptr; // fftixy2ip[iy + ix * fftny]: ip of proc which contains stick on (ix, iy). if no stick: -1
@@ -352,7 +353,10 @@ class PW_Basis
     void recip_to_real(TK* in,
                        TR* out,
                        const bool add = false,
-                       const typename GetTypeReal<TK>::type factor = 1.0) const;
+                       const typename GetTypeReal<TK>::type factor = 1.0) const
+                       {
+                        this->recip2real_gpu(in,out,add,factor);
+                       };
 
     // template <typename FPTYPE,
     //         typename Device,
@@ -383,9 +387,7 @@ class PW_Basis
      * values in the output array.
      * @param factor Optional scaling factor, default value 1.0, applied to the output values.
      */
-    template <typename TK,
-            typename TR,
-            typename Device,
+    template <typename TR,typename TK,typename Device,
             typename std::enable_if<!std::is_same<TK, typename GetTypeReal<TK>::type>::value
                     && (std::is_same<TR, typename GetTypeReal<TK>::type>::value || std::is_same<TR, TK>::value)
                     && std::is_same<Device, base_device::DEVICE_CPU>::value ,int>::type = 0>
@@ -397,14 +399,17 @@ class PW_Basis
         this->real2recip(in, out, add, factor);
     }
 
-    template <typename TK,typename TR, typename Device,
+    template <typename TR, typename TK, typename Device, // NOTE(review): parameter order changes from <TK, TR, Device> to <TR, TK, Device>; call sites that spell out template arguments must follow — confirm
             typename std::enable_if<!std::is_same<TK, typename GetTypeReal<TK>::type>::value
                     && (std::is_same<TR, typename GetTypeReal<TK>::type>::value || std::is_same<TR, TK>::value)
-                    && !std::is_same<Device, base_device::DEVICE_CPU>::value ,int>::type = 0>
+                    && std::is_same<Device, base_device::DEVICE_GPU>::value ,int>::type = 0> // now enabled only when Device is DEVICE_GPU (previously: any Device other than DEVICE_CPU)
     void real_to_recip(TR* in,
                        TK* out,
                        const bool add = false,
-                       const typename GetTypeReal<TK>::type factor = 1.0) const;
+                       const typename GetTypeReal<TK>::type factor = 1.0) const
+                       {
+                        this->real2recip_gpu(in,out,add,factor); // forwards all four arguments unchanged to real2recip_gpu
+                       };
 
   protected:
     //gather planes and scatter sticks of all processors
 
@@ -170,7 +170,6 @@ class PW_Basis_Big : public PW_Basis_Sup
     MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX , this->pool_world);
 #endif
 
-
     // Find the minimal FFT box size the factors into the primes (2,3,5,7).
     for (int i = 0; i < 3; i++)
     {
 
@@ -56,8 +56,7 @@ class PW_Basis_K_Big: public PW_Basis_K
         for(int ip = 0 ; ip < this->poolnproc ; ++ip)
         {
             this->numz[ip] = npbz*this->bz;
-            if(ip < modbz) {   this->numz[ip]+=this->bz;
-}
+            if(ip < modbz) {   this->numz[ip]+=this->bz;}
             if(ip < this->poolnproc - 1)   this->startz[ip+1] = this->startz[ip] + numz[ip];
             if(ip == this->poolrank) 
             {
 
@@ -114,12 +114,12 @@ void PW_Basis_Sup::distribution_method3(const ModulePW::PW_Basis* pw_rho)
         this->count_pw_st(st_length2D, st_bottom2D);
     }
 #ifdef __MPI
-    MPI_Bcast(&this->npwtot, 1, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(&this->nstot, 1, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(&liy, 1, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(&riy, 1, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(&lix, 1, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(&rix, 1, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(&this->npwtot, 1, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(&this->nstot, 1, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(&liy, 1, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(&riy, 1, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(&lix, 1, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(&rix, 1, MPI_INT, 0, this->pool_world);
 #endif
     delete[] this->istot2ixy;
     this->istot2ixy = new int[this->nstot];
@@ -164,14 +164,14 @@ void PW_Basis_Sup::distribution_method3(const ModulePW::PW_Basis* pw_rho)
         }
 #endif
     }
-
 #ifdef __MPI
-    MPI_Bcast(st_length2D, this->fftnxy, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(st_bottom2D, this->fftnxy, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(this->fftixy2ip, this->fftnxy, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(this->istot2ixy, this->nstot, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(this->nst_per, this->poolnproc, MPI_INT, 0, this->pool_world);
-    MPI_Bcast(this->npw_per, this->poolnproc, MPI_INT, 0, this->pool_world);
+   
+        MPI_Bcast(st_length2D, this->fftnxy, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(st_bottom2D, this->fftnxy, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(this->fftixy2ip, this->fftnxy, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(this->istot2ixy, this->nstot, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(this->nst_per, this->poolnproc, MPI_INT, 0, this->pool_world);
+        MPI_Bcast(this->npw_per, this->poolnproc, MPI_INT, 0, this->pool_world);
 #endif
     this->npw = this->npw_per[this->poolrank];
     this->nst = this->nst_per[this->poolrank];
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ PW_Basis:: ~PW_Basis()`
`43`	`43`	`if (this->device == "gpu")`
`44`	`44`	`{`
`45`	`45`	`delmem_int_op()(this->d_is2fftixy);`
	`46`	`+ delmem_int_op()(this->ig2ixyz_gpu);`
`46`	`47`	`}`
`47`	`48`	`#endif`
`48`	`49`	`}`
`@@ -59,6 +60,7 @@ void PW_Basis::setuptransform()`
`59`	`60`	`this->distribute_g();`
`60`	`61`	`this->getstartgr();`
`61`	`62`	`this->fft_bundle.clear();`
	`63`	`+`
`62`	`64`	`if(this->xprime)`
`63`	`65`	`{`
`64`	`66`	`this->fft_bundle.initfft(this->nx,this->ny,this->nz,this->lix,this->rix,this->nst,this->nplane,this->poolnproc,this->gamma_only, this->xprime);`
Original file line number	Diff line number	Diff line change
`@@ -170,7 +170,6 @@ class PW_Basis_Big : public PW_Basis_Sup`
`170`	`170`	`MPI_Allreduce(MPI_IN_PLACE, ibox, 3, MPI_INT, MPI_MAX , this->pool_world);`
`171`	`171`	`#endif`
`172`	`172`
`173`		`-`
`174`	`173`	`// Find the minimal FFT box size the factors into the primes (2,3,5,7).`
`175`	`174`	`for (int i = 0; i < 3; i++)`
`176`	`175`	`{`
Original file line number	Diff line number	Diff line change
`@@ -56,8 +56,7 @@ class PW_Basis_K_Big: public PW_Basis_K`
`56`	`56`	`for(int ip = 0 ; ip < this->poolnproc ; ++ip)`
`57`	`57`	`{`
`58`	`58`	`this->numz[ip] = npbz*this->bz;`
`59`		`- if(ip < modbz) { this->numz[ip]+=this->bz;`
`60`		`-}`
	`59`	`+ if(ip < modbz) { this->numz[ip]+=this->bz;}`
`61`	`60`	`if(ip < this->poolnproc - 1) this->startz[ip+1] = this->startz[ip] + numz[ip];`
`62`	`61`	`if(ip == this->poolrank)`
`63`	`62`	`{`