@@ -1119,6 +1119,7 @@ template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
11191119template struct cal_multi_dot_op <float , base_device::DEVICE_GPU>;
11201120
11211121// HIP kernel for stress Ewald sincos calculation
1122+ // Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
11221123template <typename FPTYPE>
11231124__global__ void cal_stress_ewa_sincos_kernel (
11241125 const int nat,
@@ -1132,36 +1133,48 @@ __global__ void cal_stress_ewa_sincos_kernel(
11321133{
11331134 const FPTYPE TWO_PI = 2.0 * M_PI;
11341135
1135- const int iat = blockIdx .y ;
1136- const int ig_start = blockIdx .x * blockDim .x + threadIdx .x ;
1137- const int ig_stride = gridDim .x * blockDim .x ;
1136+ // Parallelize over G-vectors (ig) instead of atoms (iat)
1137+ const int ig = blockIdx .x * blockDim .x + threadIdx .x ;
11381138
1139- if (iat >= nat ) return ;
1139+ if (ig >= npw ) return ;
11401140
1141- // Load atom information to registers
1142- const FPTYPE tau_x = tau[iat * 3 + 0 ];
1143- const FPTYPE tau_y = tau[iat * 3 + 1 ];
1144- const FPTYPE tau_z = tau[iat * 3 + 2 ];
1145- const FPTYPE zv = zv_facts[iat];
1141+ // Skip G=0 term
1142+ if (ig == ig_gge0) return ;
11461143
1147- // Grid-stride loop over G-vectors
1148- for (int ig = ig_start; ig < npw; ig += ig_stride) {
1149- // Skip G=0 term
1150- if (ig == ig_gge0) continue ;
1144+ // Local accumulation variables for this G-vector
1145+ FPTYPE local_rhostar_real = 0.0 ;
1146+ FPTYPE local_rhostar_imag = 0.0 ;
11511147
1152- // Calculate phase: 2π * (G · τ)
1153- const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0 ] * tau_x +
1154- gcar[ig * 3 + 1 ] * tau_y +
1155- gcar[ig * 3 + 2 ] * tau_z);
1148+ // Load G-vector components to registers
1149+ const FPTYPE gcar_x = gcar[ig * 3 + 0 ];
1150+ const FPTYPE gcar_y = gcar[ig * 3 + 1 ];
1151+ const FPTYPE gcar_z = gcar[ig * 3 + 2 ];
1152+
1153+ // Loop over all atoms for this G-vector (matches CPU implementation)
1154+ for (int iat = 0 ; iat < nat; iat++) {
1155+ // Load atom information to registers
1156+ const FPTYPE tau_x = tau[iat * 3 + 0 ];
1157+ const FPTYPE tau_y = tau[iat * 3 + 1 ];
1158+ const FPTYPE tau_z = tau[iat * 3 + 2 ];
1159+ const FPTYPE zv = zv_facts[iat];
1160+
1161+ // Calculate phase: 2π * (G · τ) - same as CPU implementation
1162+ const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
1163+ gcar_y * tau_y +
1164+ gcar_z * tau_z);
11561165
11571166 // Use HIP intrinsic for sincos
11581167 FPTYPE sinp, cosp;
11591168 sincos (phase, &sinp, &cosp);
11601169
1161- // Atomic add to accumulate structure factor
1162- atomicAdd (&rhostar_real[ig], zv * cosp) ;
1163- atomicAdd (&rhostar_imag[ig], zv * sinp) ;
1170+ // Accumulate structure factor locally (no race conditions)
1171+ local_rhostar_real += zv * cosp;
1172+ local_rhostar_imag += zv * sinp;
11641173 }
1174+
1175+ // Store results - each thread writes to unique memory location
1176+ rhostar_real[ig] = local_rhostar_real;
1177+ rhostar_imag[ig] = local_rhostar_imag;
11651178}
11661179
11671180// GPU operators
@@ -1177,15 +1190,12 @@ void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
11771190 FPTYPE* rhostar_real,
11781191 FPTYPE* rhostar_imag)
11791192{
1180- // Note: Arrays are already initialized to zero in the calling function
1181- // No need to initialize again here to avoid redundant operations
1182-
1183- // Calculate optimal grid configuration for GPU load balancing
1193+ // Calculate grid configuration for G-vector parallelization
11841194 const int threads_per_block = THREADS_PER_BLOCK;
1185- const int max_blocks_per_sm = 32 ; // Configurable per GPU architecture
1186- const int max_blocks_x = std::min (max_blocks_per_sm, (npw + threads_per_block - 1 ) / threads_per_block);
1195+ const int num_blocks = (npw + threads_per_block - 1 ) / threads_per_block;
11871196
1188- dim3 grid (max_blocks_x, nat);
1197+ // Use 1D grid since we're parallelizing over G-vectors only
1198+ dim3 grid (num_blocks);
11891199 dim3 block (threads_per_block);
11901200
11911201 hipLaunchKernelGGL (HIP_KERNEL_NAME (cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0 , 0 ,
0 commit comments