Implement stress_ewa_op for CUDA & ROCm

jieli-matrix · jieli-matrix · commit 9c6eafb49c73 · 2025-06-17T14:50:44.000+08:00
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -1136,6 +1136,87 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 
+// CUDA kernel: adds zv_facts[iat] * (cos, sin)(2*pi * G.tau) into rhostar_real/imag[ig].
+// blockIdx.y picks the atom, x-threads stride over ig; outputs are accumulated, not reset.
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const int atom = blockIdx.y;
+    if (atom >= nat) return;
+
+    const FPTYPE two_pi = 2.0 * M_PI;
+    const int first_ig = blockIdx.x * blockDim.x + threadIdx.x;
+    const int step = gridDim.x * blockDim.x;
+
+    // Per-atom data is loop-invariant: read it once before walking the G-vectors.
+    const FPTYPE* pos = tau + atom * 3;
+    const FPTYPE px = pos[0];
+    const FPTYPE py = pos[1];
+    const FPTYPE pz = pos[2];
+    const FPTYPE weight = zv_facts[atom];
+
+    for (int ig = first_ig; ig < npw; ig += step) {
+        // Index ig_gge0 (the G=0 term) is skipped and receives no contribution.
+        if (ig != ig_gge0) {
+            const FPTYPE* g = gcar + ig * 3;
+
+            // phase = 2*pi * (G . tau)
+            const FPTYPE phase = two_pi * (g[0] * px + g[1] * py + g[2] * pz);
+
+            // One sincos() call yields both components of the phase factor.
+            FPTYPE s, c;
+            sincos(phase, &s, &c);
+
+            // Several atoms (different blockIdx.y) hit the same ig, hence atomics.
+            atomicAdd(&rhostar_real[ig], weight * c);
+            atomicAdd(&rhostar_imag[ig], weight * s);
+        }
+    }
+}
+
+// GPU operator: launches the sincos kernel on a (G-vector blocks) x (atoms) grid.
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    // The kernel only accumulates: the caller must zero rhostar_real/rhostar_imag first.
+    // A zero-sized grid dimension is an invalid launch configuration, and with no atoms
+    // or no G-vectors there is nothing to add anyway.
+    if (nat <= 0 || npw <= 0) return;
+
+    // grid.x is capped; the kernel's stride loop over ig covers the rest of npw.
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_grid_x = 32;
+    const int blocks_x = std::min(max_grid_x, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(blocks_x, nat);  // one grid row per atom (kernel reads blockIdx.y)
+    dim3 block(threads_per_block);
+
+    cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, zv_facts, rhostar_real, rhostar_imag);
+
+    cudaCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;   // explicit instantiation: float
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;  // explicit instantiation: double
+
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -1117,4 +1117,86 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
+
+// HIP kernel: adds zv_facts[iat] * (cos, sin)(2*pi * G.tau) into rhostar_real/imag[ig].
+// blockIdx.y picks the atom, x-threads stride over ig; outputs are accumulated, not reset.
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const int atom = blockIdx.y;
+    if (atom >= nat) return;
+
+    const FPTYPE two_pi = 2.0 * M_PI;
+    const int first_ig = blockIdx.x * blockDim.x + threadIdx.x;
+    const int step = gridDim.x * blockDim.x;
+
+    // Per-atom data is loop-invariant: read it once before walking the G-vectors.
+    const FPTYPE* pos = tau + atom * 3;
+    const FPTYPE px = pos[0];
+    const FPTYPE py = pos[1];
+    const FPTYPE pz = pos[2];
+    const FPTYPE weight = zv_facts[atom];
+
+    for (int ig = first_ig; ig < npw; ig += step) {
+        // Index ig_gge0 (the G=0 term) is skipped and receives no contribution.
+        if (ig != ig_gge0) {
+            const FPTYPE* g = gcar + ig * 3;
+
+            // phase = 2*pi * (G . tau)
+            const FPTYPE phase = two_pi * (g[0] * px + g[1] * py + g[2] * pz);
+
+            // One sincos() call yields both components of the phase factor.
+            FPTYPE s, c;
+            sincos(phase, &s, &c);
+
+            // Several atoms (different blockIdx.y) hit the same ig, hence atomics.
+            atomicAdd(&rhostar_real[ig], weight * c);
+            atomicAdd(&rhostar_imag[ig], weight * s);
+        }
+    }
+}
+
+// GPU operator: launches the sincos kernel on a (G-vector blocks) x (atoms) grid.
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    // The kernel only accumulates: the caller must zero rhostar_real/rhostar_imag first.
+    // A zero-sized grid dimension is an invalid launch configuration, and with no atoms
+    // or no G-vectors there is nothing to add anyway.
+    if (nat <= 0 || npw <= 0) return;
+
+    // grid.x is capped; the kernel's stride loop over ig covers the rest of npw.
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_grid_x = 32;
+    const int blocks_x = std::min(max_grid_x, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(blocks_x, nat);  // one grid row per atom (kernel reads blockIdx.y)
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw, ig_gge0, gcar, tau, zv_facts, rhostar_real, rhostar_imag);
+
+    hipCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;   // explicit instantiation: float
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;  // explicit instantiation: double
+
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -775,9 +775,8 @@ struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_CPU>
     {
         const FPTYPE TWO_PI = 2.0 * M_PI;
 
-        // Initialize output arrays
-        std::fill(rhostar_real, rhostar_real + npw, static_cast<FPTYPE>(0.0));
-        std::fill(rhostar_imag, rhostar_imag + npw, static_cast<FPTYPE>(0.0));
+        // Note: Arrays are already initialized to zero in the calling function
+        // No need to initialize again here to avoid redundant operations
 
 #ifdef _OPENMP
 #pragma omp parallel for
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
@@ -83,19 +83,70 @@ void Stress_Func<FPTYPE, Device>::stress_ewa(const UnitCell& ucell,
     std::vector<FPTYPE> rhostar_real_host(rho_basis->npw);
     std::vector<FPTYPE> rhostar_imag_host(rho_basis->npw);
 
+    // Device pointers for GPU data transfer
+    FPTYPE* d_gcar = nullptr;
+    FPTYPE* d_tau = nullptr;
+    FPTYPE* d_zv_facts = nullptr;
+    FPTYPE* d_rhostar_real = nullptr;
+    FPTYPE* d_rhostar_imag = nullptr;
+
+    // GPU memory management and data transfer
+    if (this->device == base_device::GpuDevice) {
+        // Allocate GPU memory
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, ucell.nat * 3);
+        resmem_var_op()(this->ctx, d_zv_facts, ucell.nat);
+        resmem_var_op()(this->ctx, d_rhostar_real, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_rhostar_imag, rho_basis->npw);
+
+        // Data transfer H2D
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), ucell.nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_zv_facts, zv_facts_host.data(), ucell.nat);
+
+        // Initialize output arrays to zero on GPU
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_real, 0.0, rho_basis->npw);
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_imag, 0.0, rho_basis->npw);
+    } else {
+        // CPU case: use host pointers directly and initialize arrays to zero
+        d_gcar = gcar_flat.data();
+        d_tau = tau_flat.data();
+        d_zv_facts = zv_facts_host.data();
+        d_rhostar_real = rhostar_real_host.data();
+        d_rhostar_imag = rhostar_imag_host.data();
+
+        // Initialize output arrays to zero for CPU case
+        std::fill(rhostar_real_host.begin(), rhostar_real_host.end(), static_cast<FPTYPE>(0.0));
+        std::fill(rhostar_imag_host.begin(), rhostar_imag_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
     // Call sincos op (outside OpenMP parallel region, op has its own parallelization)
     hamilt::cal_stress_ewa_sincos_op<FPTYPE, Device>()(
         this->ctx,
         ucell.nat,
         rho_basis->npw,
         rho_basis->ig_gge0,
-        gcar_flat.data(),
-        tau_flat.data(),
-        zv_facts_host.data(),
-        rhostar_real_host.data(),
-        rhostar_imag_host.data()
+        d_gcar,
+        d_tau,
+        d_zv_facts,
+        d_rhostar_real,
+        d_rhostar_imag
     );
 
+    // GPU data transfer D2H and memory cleanup
+    if (this->device == base_device::GpuDevice) {
+        // Transfer results back to host
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_real_host.data(), d_rhostar_real, rho_basis->npw);
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_imag_host.data(), d_rhostar_imag, rho_basis->npw);
+
+        // Free GPU memory
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_zv_facts);
+        delmem_var_op()(this->ctx, d_rhostar_real);
+        delmem_var_op()(this->ctx, d_rhostar_imag);
+    }
+
 #ifdef _OPENMP
 #pragma omp parallel
 {