Perf: add cuda support for cal_force_loc (#6427)

dzzz2001 · web-flow · commit acaa1fc23ae1 · 2025-08-04T11:08:39.000+08:00
* add cuda support for cal_force_loc

* fix compilation bug

* fix bug
diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp
@@ -329,7 +329,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
 {
     ModuleBase::TITLE("Forces", "cal_force_loc");
     ModuleBase::timer::tick("Forces", "cal_force_loc");
-
+    this->device = base_device::get_device_type<Device>(this->ctx);
     std::complex<double>* aux = new std::complex<double>[rho_basis->nmaxgr];
     // now, in all pools , the charge are the same,
     // so, the force calculated by each pool is equal.
@@ -368,30 +368,105 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int iat = 0; iat < this->nat; ++iat)
+    std::vector<double> tau_h;
+    std::vector<double> gcar_h;
+    if(this->device == base_device::GpuDevice)
     {
-        // read `it` `ia` from the table
-        int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
+        tau_h.resize(this->nat * 3);
+        for(int iat = 0; iat < this->nat; ++iat)
         {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            tau_h[iat * 3] = ucell.atoms[it].tau[ia].x;
+            tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y;
+            tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z;
         }
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+
+        gcar_h.resize(rho_basis->npw * 3);
+        for(int ig = 0; ig < rho_basis->npw; ++ig)
+        {
+            gcar_h[ig * 3] = rho_basis->gcar[ig].x;
+            gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y;
+            gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z;
+        }
+    }
+    int* iat2it_d = nullptr;
+    int* ig2gg_d = nullptr;
+    double* gcar_d = nullptr;
+    double* tau_d = nullptr;
+    std::complex<double>* aux_d = nullptr;
+    double* forcelc_d  = nullptr;
+    double* vloc_d = nullptr;
+    if(this->device == base_device::GpuDevice)
+    {
+        resmem_int_op()(iat2it_d, this->nat);
+        resmem_int_op()(ig2gg_d, rho_basis->npw);
+        resmem_var_op()(gcar_d, rho_basis->npw * 3);
+        resmem_var_op()(tau_d, this->nat * 3);
+        resmem_complex_op()(aux_d, rho_basis->npw);
+        resmem_var_op()(forcelc_d, this->nat * 3);
+        resmem_var_op()(vloc_d, vloc.nr * vloc.nc);
+
+        syncmem_int_h2d_op()(iat2it_d, ucell.iat2it, this->nat);
+        syncmem_int_h2d_op()(ig2gg_d, rho_basis->ig2igg, rho_basis->npw);
+        syncmem_var_h2d_op()(gcar_d, gcar_h.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(tau_d, tau_h.data(), this->nat * 3);
+        syncmem_complex_h2d_op()(aux_d, aux, rho_basis->npw);
+        syncmem_var_h2d_op()(forcelc_d, forcelc.c, this->nat * 3);
+        syncmem_var_h2d_op()(vloc_d, vloc.c, vloc.nr * vloc.nc);
     }
 
+    if(this->device == base_device::GpuDevice)
+    {
+        hamilt::cal_force_loc_op<FPTYPE, Device>()(
+            this->nat,
+            rho_basis->npw,
+            ucell.tpiba * ucell.omega,
+            iat2it_d,
+            ig2gg_d,
+            gcar_d,
+            tau_d,
+            aux_d,
+            vloc_d,
+            vloc.nc,
+            forcelc_d);
+        syncmem_var_d2h_op()(forcelc.c, forcelc_d, this->nat * 3);
+    }
+    else{
+        #ifdef _OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int iat = 0; iat < this->nat; ++iat)
+        {
+            // read `it` `ia` from the table
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            for (int ig = 0; ig < rho_basis->npw; ig++)
+            {
+                const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
+                double sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                const double factor
+                    = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
+                forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
+                forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+            }
+            forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
+            forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
+            forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+        }
+    }
+    if(this->device == base_device::GpuDevice)
+    {
+        delmem_int_op()(iat2it_d);
+        delmem_int_op()(ig2gg_d);
+        delmem_var_op()(gcar_d);
+        delmem_var_op()(tau_d);
+        delmem_complex_op()(aux_d);
+        delmem_var_op()(forcelc_d);
+        delmem_var_op()(vloc_d);
+    }
     // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
diff --git a/source/source_pw/module_pwdft/kernels/cuda/force_op.cu b/source/source_pw/module_pwdft/kernels/cuda/force_op.cu
@@ -1,6 +1,7 @@
 #include "source_pw/module_pwdft/kernels/force_op.h"
 // #include "source_psi/kernels/device.h"
 #include "source_base/module_device/types.h"
+#include "source_base/constants.h"
 
 #include <complex>
 
@@ -10,9 +11,19 @@
 #include <source_base/module_device/device.h>
 
 #define THREADS_PER_BLOCK 256
+#define FULL_MASK 0xffffffff
+#define WARP_SIZE 32
 
 namespace hamilt {
 
+template <typename FPTYPE>
+__forceinline__
+__device__
+void warp_reduce(FPTYPE & val) {
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        val += __shfl_down_sync(FULL_MASK, val, offset);
+    }
+}
 
 template <typename FPTYPE>
 __global__ void cal_vkb1_nl(
@@ -606,15 +617,139 @@ void revertVkbValues(
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void force_loc_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE tpiba_omega,
+    const int* iat2it,
+    const int* ig2gg_d,
+    const FPTYPE* gcar_d,
+    const FPTYPE* tau_d,
+    const thrust::complex<FPTYPE>* aux_d,
+    const FPTYPE* vloc_d,
+    const int vloc_nc,
+    FPTYPE* forcelc_d)
+{
+    const int iat = blockIdx.x;
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+    
+    if (iat >= nat) return;
+    const int it = iat2it[iat]; // get the type of atom
+
+    // Initialize force components
+    FPTYPE force_x = 0.0;
+    FPTYPE force_y = 0.0;
+    FPTYPE force_z = 0.0;
+    
+    const auto tau_x = tau_d[iat * 3 + 0];
+    const auto tau_y = tau_d[iat * 3 + 1];
+    const auto tau_z = tau_d[iat * 3 + 2];
+
+    // Process all plane waves in chunks of blockDim.x
+    for (int ig = tid; ig < npw; ig += blockDim.x) {
+        const auto gcar_x = gcar_d[ig * 3 + 0];
+        const auto gcar_y = gcar_d[ig * 3 + 1];
+        const auto gcar_z = gcar_d[ig * 3 + 2];
+
+        // Calculate phase factor
+        const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x +
+                                                   gcar_y * tau_y +
+                                                   gcar_z * tau_z);
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Get vloc value
+        const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]];
+        
+        // Calculate factor
+        const auto aux_val = aux_d[ig];
+        const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real());
+        
+        // Multiply by gcar components
+        force_x += gcar_x * factor;
+        force_y += gcar_y * factor;
+        force_z += gcar_z * factor;
+    }
+    
+    // Warp-level reduction
+    warp_reduce<FPTYPE>(force_x);
+    warp_reduce<FPTYPE>(force_y);
+    warp_reduce<FPTYPE>(force_z);
+    
+    // First thread in each warp writes to shared memory
+    __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps
+    __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE];
+    __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE];
+    
+    if (lane_id == 0) {
+        warp_sums_x[warp_id] = force_x;
+        warp_sums_y[warp_id] = force_y;
+        warp_sums_z[warp_id] = force_z;
+    }
+    
+    __syncthreads();
+    
+    // Final reduction by first warp
+    if (warp_id == 0) {
+        FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0;
+        FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0;
+        FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0;
+        
+        warp_reduce<FPTYPE>(final_x);
+        warp_reduce<FPTYPE>(final_y);
+        warp_reduce<FPTYPE>(final_z);
+        
+        if (lane_id == 0) {
+            forcelc_d[iat * 3 + 0] = final_x * tpiba_omega;
+            forcelc_d[iat * 3 + 1] = final_y * tpiba_omega;
+            forcelc_d[iat * 3 + 2] = final_z * tpiba_omega;
+        }
+    }
+}
+
+template <typename FPTYPE>
+void cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const int nat,
+    const int npw,
+    const FPTYPE tpiba_omega,
+    const int* iat2it,
+    const int* ig2igg,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE* vloc,
+    const int vloc_nc,
+    FPTYPE* forcelc)
+{
+    force_loc_kernel<FPTYPE>
+        <<<nat, THREADS_PER_BLOCK>>>(nat,
+                                     npw,
+                                     tpiba_omega,
+                                     iat2it,
+                                     ig2igg,
+                                     gcar,
+                                     tau,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                                     vloc,
+                                     vloc_nc,
+                                     forcelc); // array of data
+
+}
+
+
 // for revertVkbValues functions instantiation
 template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<double> *vkb_ptr, const std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex<double> coeff);
 // for saveVkbValues functions instantiation
 template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
-
+template struct cal_force_loc_op<double, base_device::DEVICE_GPU>;
 }  // namespace hamilt
diff --git a/source/source_pw/module_pwdft/kernels/force_op.h b/source/source_pw/module_pwdft/kernels/force_op.h
@@ -149,6 +149,21 @@ struct cal_force_nl_op
                     FPTYPE* force);
 };
 
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_op{
+    void operator()(
+        const int nat,
+        const int npw,
+        const FPTYPE tpiba_omega,
+        const int* iat2it,
+        const int* ig2igg,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const std::complex<FPTYPE>* aux,
+        const FPTYPE* vloc,
+        const int vloc_nr,
+        FPTYPE* forcelc) {};
+};
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
 struct cal_vkb1_nl_op<FPTYPE, base_device::DEVICE_GPU>
@@ -275,6 +290,21 @@ void saveVkbValues(const int* gcar_zero_ptrs,
                    int ipol,
                    int npwx);
 
+template <typename FPTYPE>
+struct cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>{
+    void operator()(
+        const int nat,
+        const int npw,
+        const FPTYPE tpiba_omega,
+        const int* iat2it,
+        const int* ig2igg,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const std::complex<FPTYPE>* aux,
+        const FPTYPE* vloc,
+        const int vloc_nr,
+        FPTYPE* forcelc);
+};
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hamilt
 #endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H