Refactor FFT format and add RAII for resource handler (#6156)

A-006 · Critsium-xy · mohanchen · web-flow · commit 691b2edd2b2a · 2025-05-25T20:16:05.000+08:00
* add change

* add comment

* fix resource handler

* add RAII

* update make unique

* rename FFT guard

* change compute mode

* change compute mode

* fix compute bug

* add the nullptr

* add brace

* update compile bug

---------

Co-authored-by: Critsium-xy <tsfxwbbzxy@163.com>
Co-authored-by: Mohan Chen <mohanchen@pku.edu.cn>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -269,6 +269,7 @@ if(ENABLE_MPI)
   list(APPEND math_libs MPI::MPI_CXX)
 endif()
 
+
 if (USE_DSP)
   add_compile_definitions(__DSP)
   target_link_libraries(${ABACUS_BIN_NAME} ${OMPI_LIBRARY1})
diff --git a/source/module_basis/module_pw/module_fft/fft_base.h b/source/module_basis/module_pw/module_fft/fft_base.h
@@ -12,7 +12,7 @@ class FFT_BASE
     virtual ~FFT_BASE() {};
 
     /**
-     * @brief Initialize the fft parameters As virtual function.
+     * @brief Initialize the fft parameters as virtual function.
      *
      * The function is used to initialize the fft parameters.
      */
@@ -30,32 +30,40 @@ class FFT_BASE
     virtual __attribute__((weak)) void initfft(int nx_in, int ny_in, int nz_in);
 
     /**
-     * @brief Setup the fft Plan and data As pure virtual function.
+     * @brief Setup the fft plan and data as pure virtual function.
      *
      * The function is set as pure virtual function.In order to
      * override the function in the derived class.In the derived
-     * class, the function is used to setup the fft Plan and data.
+     * class, the function is used to setup the fft plan and data.
      */
     virtual void setupFFT() = 0;
 
     /**
-     * @brief Clean the fft Plan  As pure virtual function.
+     * @brief Clean the fft plan as pure virtual function.
      *
      * The function is set as pure virtual function.In order to
      * override the function in the derived class.In the derived
-     * class, the function is used to clean the fft Plan.
+     * class, the function is used to clean the fft plan.
      */
     virtual void cleanFFT() = 0;
 
     /**
-     * @brief Clear the fft data As pure virtual function.
+     * @brief Clear the fft data as pure virtual function.
      *
      * The function is set as pure virtual function.In order to
      * override the function in the derived class.In the derived
      * class, the function is used to clear the fft data.
      */
     virtual void clear() = 0;
-
+    /**
+     * @brief Allocate or destroy the resources used while the FFT is
+     * running. Currently it is only used in the DSP mode.
+     *
+     * Unlike the functions above, this is not a pure virtual function:
+     * the base class provides an empty default implementation, and a
+     * derived class may override it to allocate (flag = 1) or destroy
+     * (flag = 0) its resources.
+     */
     virtual void resource_handler(const int flag) const {};
     /**
      * @brief Get the real space data in cpu-like fft
diff --git a/source/module_basis/module_pw/module_fft/fft_bundle.cpp b/source/module_basis/module_pw/module_fft/fft_bundle.cpp
@@ -50,16 +50,22 @@ void FFT_Bundle::initfft(int nx_in,
     if (this->precision == "single" || this->precision == "mixing")
     {
         float_flag = true;
+        if (this->precision == "mixing")
+        {
+            double_flag = true;
+        }
 #if not defined(__ENABLE_FLOAT_FFTW)
         if (this->device == "cpu")
         {
             ModuleBase::WARNING_QUIT("FFT_Bundle", "Please enable float fftw in the cmake to use float fft");
         }
 #endif
     }
-    if (this->precision == "double" || this->precision == "mixing")
+    else if (this->precision == "double")
     {
         double_flag = true;
+    }else{
+        ModuleBase::WARNING_QUIT("FFT_Bundle", "Please set the precision to single or double or mixing");
     }
 #if defined(__DSP)
     if (device == "dsp")
@@ -70,24 +76,23 @@ void FFT_Bundle::initfft(int nx_in,
         }
         fft_double = make_unique<FFT_DSP<double>>();
         fft_double->initfft(nx_in, ny_in, nz_in);
-    }
+    }else
 #endif
     if (device == "cpu")
     {
-        fft_float = make_unique<FFT_CPU<float>>(this->fft_mode);
-        fft_double = make_unique<FFT_CPU<double>>(this->fft_mode);
         if (float_flag)
         {
+            fft_float = make_unique<FFT_CPU<float>>(this->fft_mode);
             fft_float
                 ->initfft(nx_in, ny_in, nz_in, lixy_in, rixy_in, ns_in, nplane_in, nproc_in, gamma_only_in, xprime_in);
         }
         if (double_flag)
         {
+            fft_double = make_unique<FFT_CPU<double>>(this->fft_mode);
             fft_double
                 ->initfft(nx_in, ny_in, nz_in, lixy_in, rixy_in, ns_in, nplane_in, nproc_in, gamma_only_in, xprime_in);
         }
-    }
-    if (device == "gpu")
+    }else if (device == "gpu")
     {
 #if defined(__ROCM)
         fft_float = make_unique<FFT_ROCM<float>>();
@@ -100,6 +105,8 @@ void FFT_Bundle::initfft(int nx_in,
         fft_double = make_unique<FFT_CUDA<double>>();
         fft_double->initfft(nx_in, ny_in, nz_in);
 #endif
+    }else{
+        ModuleBase::WARNING_QUIT("FFT_Bundle", "Please set the device to cpu or gpu or dsp");
     }
 }
 
diff --git a/source/module_basis/module_pw/module_fft/fft_bundle.h b/source/module_basis/module_pw/module_fft/fft_bundle.h
@@ -203,5 +203,18 @@ class FFT_Bundle
     std::string device = "cpu";
     std::string precision = "double";
 };
+// RAII (Resource Acquisition Is Initialization) guard for the resources used by
+// hthread in DSP mode: resource_handler(1) on construction, resource_handler(0) on destruction.
+struct FFT_Guard
+  {
+      const FFT_Bundle& fft_;
+      FFT_Guard(const FFT_Bundle& fft) : fft_(fft) 
+        {fft_.resource_handler(1);}
+      ~FFT_Guard()
+      {
+        fft_.resource_handler(0);
+      }
+  };
+
 } // namespace ModulePW
 #endif // FFT_H
diff --git a/source/module_basis/module_pw/module_fft/fft_dsp.cpp b/source/module_basis/module_pw/module_fft/fft_dsp.cpp
@@ -63,7 +63,7 @@ void FFT_DSP<double>::setupFFT()
 template <>
 void FFT_DSP<double>::resource_handler(const int flag) const
 {
-    if (flag==0)
+    if (flag == 0)
     {
         hthread_barrier_destroy(b_id);
         hthread_group_destroy(thread_id_for);
@@ -76,6 +76,8 @@ void FFT_DSP<double>::resource_handler(const int flag) const
         b_id = hthread_barrier_create(cluster_id);
         args_for[0] = b_id;
         args_back[0] = b_id;
+    }else{
+        ModuleBase::WARNING_QUIT("FFT_DSP", "Error use of fft resource handle");
     }
 }
 template <>
diff --git a/source/module_basis/module_pw/module_fft/fft_dsp.h b/source/module_basis/module_pw/module_fft/fft_dsp.h
@@ -12,6 +12,7 @@
 
 namespace ModulePW
 {
+    
 template <typename FPTYPE>
 class FFT_DSP : public FFT_BASE<FPTYPE>
 {
@@ -24,7 +25,12 @@ class FFT_DSP : public FFT_BASE<FPTYPE>
         void clear() override;
 
         void cleanFFT() override;
-
+        /**
+         * @brief Control the allocation or deallocation of hthread 
+         * resource 
+         * @param flag  0: deallocate, 1: allocate
+         */
+        void resource_handler(const int flag) const override;
         /** 
         * @brief Initialize the fft parameters
         * @param nx_in  number of grid points in x direction
diff --git a/source/module_basis/module_pw/pw_basis_k.h b/source/module_basis/module_pw/pw_basis_k.h
@@ -187,7 +187,7 @@ class PW_Basis_K : public PW_Basis
                        const typename GetTypeReal<TK>::type factor = 1.0) const
     {
       #if defined(__DSP)
-        this->recip2real_dsp(in, out, ik, add, factor);
+        this->real2recip_dsp(in, out, ik, add, factor);
       #else
         this->real2recip(in,out,ik,add,factor);
       #endif
diff --git a/source/module_basis/module_pw/pw_transform_k_dsp.cpp b/source/module_basis/module_pw/pw_transform_k_dsp.cpp
@@ -8,12 +8,30 @@
 #if defined (__DSP)
 namespace ModulePW
 {
-template <typename FPTYPE>
-void PW_Basis_K::real2recip_dsp(const std::complex<FPTYPE>* in,
-                                std::complex<FPTYPE>* out,
+    template <>
+void PW_Basis_K::real2recip_dsp(const std::complex<float>* in,
+                                std::complex<float>* out,
                                 const int ik,
                                 const bool add,
-                                const FPTYPE factor) const
+                                const float factor) const
+                                {
+
+                                }
+    template <>
+void PW_Basis_K::recip2real_dsp(const std::complex<float>* in,
+                                std::complex<float>* out,
+                                const int ik,
+                                const bool add,
+                                const float factor) const
+                                {
+
+                                }
+template <>
+void PW_Basis_K::real2recip_dsp(const std::complex<double>* in,
+                                std::complex<double>* out,
+                                const int ik,
+                                const bool add,
+                                const double factor) const
 {
     const base_device::DEVICE_CPU* ctx;
     const base_device::DEVICE_GPU* gpux;
@@ -31,20 +49,20 @@ void PW_Basis_K::real2recip_dsp(const std::complex<FPTYPE>* in,
                                    auxr);
     this->fft_bundle.resource_handler(0);
     // copy the result from the auxr to the out ,while consider the add
-    set_real_to_recip_output_op<FPTYPE, base_device::DEVICE_CPU>()(npw_k,
+    set_real_to_recip_output_op<double, base_device::DEVICE_CPU>()(npw_k,
                                                                    this->nxyz,
                                                                    add,
                                                                    factor,
                                                                    this->ig2ixyz_k_cpu.data() + startig,
                                                                    auxr,
                                                                    out);
 }
-template <typename FPTYPE>
-void PW_Basis_K::recip2real_dsp(const std::complex<FPTYPE>* in,
-                                std::complex<FPTYPE>* out,
+template <>
+void PW_Basis_K::recip2real_dsp(const std::complex<double>* in,
+                                std::complex<double>* out,
                                 const int ik,
                                 const bool add,
-                                const FPTYPE factor) const
+                                const double factor) const
 {
     assert(this->gamma_only == false);
     const base_device::DEVICE_CPU* ctx;
@@ -128,16 +146,16 @@ void PW_Basis_K::convolution(const base_device::DEVICE_CPU* ctx,
     ModuleBase::timer::tick(this->classname, "convolution");
 }
 
-// template void PW_Basis_K::real2recip_dsp<float>(const std::complex<float>* in,
-//                                             std::complex<float>* out,
-//                                             const int ik,
-//                                             const bool add,
-//                                             const float factor) const; // in:(nplane,nx*ny)  ; out(nz, ns)
-// template void PW_Basis_K::recip2real_dsp<float>(const std::complex<float>* in,
-//                                             std::complex<float>* out,
-//                                             const int ik,
-//                                             const bool add,
-//                                             const float factor) const; // in:(nz, ns)  ; out(nplane,nx*ny)
+template void PW_Basis_K::real2recip_dsp<float>(const std::complex<float>* in,
+                                            std::complex<float>* out,
+                                            const int ik,
+                                            const bool add,
+                                            const float factor) const; // in:(nplane,nx*ny)  ; out(nz, ns)
+template void PW_Basis_K::recip2real_dsp<float>(const std::complex<float>* in,
+                                            std::complex<float>* out,
+                                            const int ik,
+                                            const bool add,
+                                            const float factor) const; // in:(nz, ns)  ; out(nplane,nx*ny)
 
 template void PW_Basis_K::real2recip_dsp<double>(const std::complex<double>* in,
                                                  std::complex<double>* out,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp
@@ -52,52 +52,82 @@ void Veff<OperatorPW<T, Device>>::act(
     {
         setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol);
     }
-
     int max_npw = nbasis / npol;
     const int current_spin = this->isk[this->ik];
-
+    const int psi_offset= max_npw * npol;
 #ifdef __DSP
-    wfcpw->fft_bundle.resource_handler(1);
-#endif
-
-    for (int ib = 0; ib < nbands; ib += npol)
+    if (npol == 1)
+    {
+        ModulePW::FFT_Guard guard(wfcpw->fft_bundle);
+        for (int ib = 0; ib < nbands; ib += npol)
+        {
+            wfcpw->convolution(this->ctx,
+                               this->ik,
+                               this->veff_col,
+                               tmpsi_in,
+                               this->veff + current_spin * this->veff_col,
+                               tmhpsi,
+                               true);
+            tmhpsi   += psi_offset;
+            tmpsi_in += psi_offset;
+        }
+    }else if (npol == 2)
+    {
+        const Real* current_veff[4]={nullptr};
+        for (int is = 0; is < 4; is++)
+        {
+            current_veff[is] = this->veff + is * this->veff_col;
+        }
+        for (int ib = 0; ib < nbands; ib += npol)
+        {
+            wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);
+            wfcpw->recip_to_real<T, Device>(tmpsi_in + max_npw, this->porter1, this->ik);
+            veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, current_veff);
+            wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);
+            wfcpw->real_to_recip<T, Device>(this->porter1, tmhpsi + max_npw, this->ik, true);
+            tmhpsi   += psi_offset;
+            tmpsi_in += psi_offset;
+        }
+    }else{
+        ModuleBase::WARNING_QUIT("VeffPW", "npol should be 1 or 2 or veff_col equal to 0\n");
+    }
+#else
+    if (npol == 1)
     {
-        if (npol == 1)
+        for (int ib = 0; ib < nbands; ib += npol)
         {
-            wfcpw->recip_to_real<T,Device>(tmpsi_in, this->porter, this->ik);
+            wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);
             // NOTICE: when MPI threads are larger than the number of Z grids
             // veff would contain nothing, and nothing should be done in real space
             // but the 3DFFT can not be skipped, it will cause hanging
-            if(this->veff_col != 0)
-            {
-                veff_op()(this->ctx, this->veff_col, this->porter, this->veff + current_spin * this->veff_col);
-            }
-            wfcpw->real_to_recip<T,Device>(this->porter, tmhpsi, this->ik, true);
+            veff_op()(this->ctx, this->veff_col, this->porter, this->veff + current_spin * this->veff_col);
+            wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);
+            tmhpsi   += psi_offset;
+            tmpsi_in += psi_offset;
         }
-        else
+    }
+    else if (npol == 2)
+    {
+        const Real* current_veff[4]={nullptr};
+        for (int is = 0; is < 4; is++)
+        {
+            current_veff[is] = this->veff + is * this->veff_col;
+        }
+        for (int ib = 0; ib < nbands; ib += npol)
         {
             // FFT to real space and do things.
-            wfcpw->recip_to_real<T,Device>(tmpsi_in, this->porter, this->ik);
-            wfcpw->recip_to_real<T,Device>(tmpsi_in + max_npw, this->porter1, this->ik);
-            if(this->veff_col != 0)
-            {
-                /// denghui added at 20221109
-				const Real* current_veff[4];
-				for(int is = 0; is < 4; is++) 
-				{
-					current_veff[is] = this->veff + is * this->veff_col ; // for CPU device
-				}
-                veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, current_veff);
-            }
+            wfcpw->recip_to_real<T, Device>(tmpsi_in, this->porter, this->ik);
+            wfcpw->recip_to_real<T, Device>(tmpsi_in + max_npw, this->porter1, this->ik);
+            veff_op()(this->ctx, this->veff_col, this->porter, this->porter1, current_veff);
             // FFT back to G space.
-            wfcpw->real_to_recip<T,Device>(this->porter, tmhpsi, this->ik, true);
-            wfcpw->real_to_recip<T,Device>(this->porter1, tmhpsi + max_npw, this->ik, true);
+            wfcpw->real_to_recip<T, Device>(this->porter, tmhpsi, this->ik, true);
+            wfcpw->real_to_recip<T, Device>(this->porter1, tmhpsi + max_npw, this->ik, true);
+            tmhpsi   += psi_offset;
+            tmpsi_in += psi_offset;
         }
-        tmhpsi += max_npw * npol;
-        tmpsi_in += max_npw * npol;
+    }else{
+        ModuleBase::WARNING_QUIT("VeffPW", "npol should be 1 or 2 or veff_col equal to 0\n");
     }
-#ifdef __DSP
-    wfcpw->fft_bundle.resource_handler(0);
 #endif
     ModuleBase::timer::tick("Operator", "veff_pw");
 }

Original file line number	Diff line number	Diff line change
`@@ -50,16 +50,22 @@ void FFT_Bundle::initfft(int nx_in,`
`50`	`50`	`if (this->precision == "single" \|\| this->precision == "mixing")`
`51`	`51`	`{`
`52`	`52`	`float_flag = true;`
	`53`	`+ if (this->precision == "mixing")`
	`54`	`+ {`
	`55`	`+ double_flag = true;`
	`56`	`+ }`
`53`	`57`	`#if not defined(__ENABLE_FLOAT_FFTW)`
`54`	`58`	`if (this->device == "cpu")`
`55`	`59`	`{`
`56`	`60`	`ModuleBase::WARNING_QUIT("FFT_Bundle", "Please enable float fftw in the cmake to use float fft");`
`57`	`61`	`}`
`58`	`62`	`#endif`
`59`	`63`	`}`
`60`		`- if (this->precision == "double" \|\| this->precision == "mixing")`
	`64`	`+ else if (this->precision == "double")`
`61`	`65`	`{`
`62`	`66`	`double_flag = true;`
	`67`	`+ }else{`
	`68`	`+ ModuleBase::WARNING_QUIT("FFT_Bundle", "Please set the precision to single or double or mixing");`
`63`	`69`	`}`
`64`	`70`	`#if defined(__DSP)`
`65`	`71`	`if (device == "dsp")`
`@@ -70,24 +76,23 @@ void FFT_Bundle::initfft(int nx_in,`
`70`	`76`	`}`
`71`	`77`	`fft_double = make_unique<FFT_DSP<double>>();`
`72`	`78`	`fft_double->initfft(nx_in, ny_in, nz_in);`
`73`		`- }`
	`79`	`+ }else`
`74`	`80`	`#endif`
`75`	`81`	`if (device == "cpu")`
`76`	`82`	`{`
`77`		`- fft_float = make_unique<FFT_CPU<float>>(this->fft_mode);`
`78`		`- fft_double = make_unique<FFT_CPU<double>>(this->fft_mode);`
`79`	`83`	`if (float_flag)`
`80`	`84`	`{`
	`85`	`+ fft_float = make_unique<FFT_CPU<float>>(this->fft_mode);`
`81`	`86`	`fft_float`
`82`	`87`	`->initfft(nx_in, ny_in, nz_in, lixy_in, rixy_in, ns_in, nplane_in, nproc_in, gamma_only_in, xprime_in);`
`83`	`88`	`}`
`84`	`89`	`if (double_flag)`
`85`	`90`	`{`
	`91`	`+ fft_double = make_unique<FFT_CPU<double>>(this->fft_mode);`
`86`	`92`	`fft_double`
`87`	`93`	`->initfft(nx_in, ny_in, nz_in, lixy_in, rixy_in, ns_in, nplane_in, nproc_in, gamma_only_in, xprime_in);`
`88`	`94`	`}`
`89`		`- }`
`90`		`- if (device == "gpu")`
	`95`	`+ }else if (device == "gpu")`
`91`	`96`	`{`
`92`	`97`	`#if defined(__ROCM)`
`93`	`98`	`fft_float = make_unique<FFT_ROCM<float>>();`
`@@ -100,6 +105,8 @@ void FFT_Bundle::initfft(int nx_in,`
`100`	`105`	`fft_double = make_unique<FFT_CUDA<double>>();`
`101`	`106`	`fft_double->initfft(nx_in, ny_in, nz_in);`
`102`	`107`	`#endif`
	`108`	`+ }else{`
	`109`	`+ ModuleBase::WARNING_QUIT("FFT_Bundle", "Please set the device to cpu or gpu or dsp");`
`103`	`110`	`}`
`104`	`111`	`}`
`105`	`112`
Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ void FFT_DSP<double>::setupFFT()`
`63`	`63`	`template <>`
`64`	`64`	`void FFT_DSP<double>::resource_handler(const int flag) const`
`65`	`65`	`{`
`66`		`- if (flag==0)`
	`66`	`+ if (flag == 0)`
`67`	`67`	`{`
`68`	`68`	`hthread_barrier_destroy(b_id);`
`69`	`69`	`hthread_group_destroy(thread_id_for);`
`@@ -76,6 +76,8 @@ void FFT_DSP<double>::resource_handler(const int flag) const`
`76`	`76`	`b_id = hthread_barrier_create(cluster_id);`
`77`	`77`	`args_for[0] = b_id;`
`78`	`78`	`args_back[0] = b_id;`
	`79`	`+ }else{`
	`80`	`+ ModuleBase::WARNING_QUIT("FFT_DSP", "Error use of fft resource handle");`
`79`	`81`	`}`
`80`	`82`	`}`
`81`	`83`	`template <>`
Original file line number	Diff line number	Diff line change
`@@ -187,7 +187,7 @@ class PW_Basis_K : public PW_Basis`
`187`	`187`	`const typename GetTypeReal<TK>::type factor = 1.0) const`
`188`	`188`	`{`
`189`	`189`	`#if defined(__DSP)`
`190`		`- this->recip2real_dsp(in, out, ik, add, factor);`
	`190`	`+ this->real2recip_dsp(in, out, ik, add, factor);`
`191`	`191`	`#else`
`192`	`192`	`this->real2recip(in,out,ik,add,factor);`
`193`	`193`	`#endif`