Commit c78f9a6

fix race condition
1 parent 9c6eafb commit c78f9a6

2 files changed: +74 -54 lines
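
As read off the new kernel body below (with Z_iat the zv_facts entry and τ_iat the atomic position stored in tau), each thread now owns one G-vector ig and accumulates the whole atom sum for it:

$$
\mathrm{rhostar\_real}[ig]=\sum_{iat=0}^{nat-1} Z_{iat}\cos\!\bigl(2\pi\,\mathbf{G}_{ig}\cdot\boldsymbol{\tau}_{iat}\bigr),
\qquad
\mathrm{rhostar\_imag}[ig]=\sum_{iat=0}^{nat-1} Z_{iat}\sin\!\bigl(2\pi\,\mathbf{G}_{ig}\cdot\boldsymbol{\tau}_{iat}\bigr),
$$

i.e. the real and imaginary parts of the structure factor $\rho^{*}(\mathbf{G}_{ig})=\sum_{iat} Z_{iat}\,e^{\,i\,2\pi\,\mathbf{G}_{ig}\cdot\boldsymbol{\tau}_{iat}}$ for every ig other than ig_gge0. Before this commit the atom sum was spread over blockIdx.y and merged with atomicAdd; afterwards each rhostar entry has exactly one writer.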

source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu

Lines changed: 37 additions & 27 deletions
@@ -1137,6 +1137,7 @@ template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 
 // CUDA kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
 template <typename FPTYPE>
 __global__ void cal_stress_ewa_sincos_kernel(
     const int nat,
@@ -1150,36 +1151,48 @@ __global__ void cal_stress_ewa_sincos_kernel(
 {
     const FPTYPE TWO_PI = 2.0 * M_PI;
 
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (iat >= nat) return;
+    if (ig >= npw) return;
 
-    // Load atom information to registers
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE zv = zv_facts[iat];
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
 
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
 
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
-                                       gcar[ig * 3 + 1] * tau_y +
-                                       gcar[ig * 3 + 2] * tau_z);
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
 
         // Use CUDA intrinsic for sincos
         FPTYPE sinp, cosp;
         sincos(phase, &sinp, &cosp);
 
-        // Atomic add to accumulate structure factor
-        atomicAdd(&rhostar_real[ig], zv * cosp);
-        atomicAdd(&rhostar_imag[ig], zv * sinp);
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
     }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
 }
 
 // GPU operators
@@ -1195,15 +1208,12 @@ void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     FPTYPE* rhostar_real,
     FPTYPE* rhostar_imag)
 {
-    // Note: Arrays are already initialized to zero in the calling function
-    // No need to initialize again here to avoid redundant operations
-
-    // Calculate optimal grid configuration for GPU load balancing
+
+    // Calculate grid configuration for G-vector parallelization
     const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
-    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
 
-    dim3 grid(max_blocks_x, nat);
+    dim3 grid(num_blocks);
     dim3 block(threads_per_block);
 
     cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(

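For reference, the pattern introduced above can be exercised in isolation. The sketch below is not ABACUS source: the kernel name demo_stress_ewa_sincos, the toy array contents, and the thread count are invented for illustration; only the one-thread-per-G-vector structure, the serial atom loop, the sincos call, and the single non-atomic store per output element mirror the patched kernel.

// demo_ewa_sincos.cu -- illustrative only; compile with `nvcc demo_ewa_sincos.cu -o demo`
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

// One thread per G-vector: the atom sum is done serially in registers,
// so no two threads ever touch the same rhostar element.
__global__ void demo_stress_ewa_sincos(const int nat,
                                       const int npw,
                                       const int ig_gge0,
                                       const double* gcar,      // npw x 3 G-vectors
                                       const double* tau,       // nat x 3 atomic positions
                                       const double* zv_facts,  // nat valence charges
                                       double* rhostar_real,    // npw outputs
                                       double* rhostar_imag)    // npw outputs
{
    const double TWO_PI = 2.0 * M_PI;
    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
    if (ig >= npw || ig == ig_gge0) return;   // out of range, or the G=0 term

    const double gx = gcar[ig * 3 + 0];
    const double gy = gcar[ig * 3 + 1];
    const double gz = gcar[ig * 3 + 2];
    double re = 0.0, im = 0.0;
    for (int iat = 0; iat < nat; ++iat)
    {
        const double phase = TWO_PI * (gx * tau[iat * 3 + 0] +
                                       gy * tau[iat * 3 + 1] +
                                       gz * tau[iat * 3 + 2]);
        double sinp, cosp;
        sincos(phase, &sinp, &cosp);
        re += zv_facts[iat] * cosp;           // local accumulation, no atomics
        im += zv_facts[iat] * sinp;
    }
    rhostar_real[ig] = re;                    // this thread is the unique writer for ig
    rhostar_imag[ig] = im;
}

int main()
{
    const int nat = 2, npw = 4, ig_gge0 = 0;
    const double h_gcar[npw * 3] = {0, 0, 0,  1, 0, 0,  0, 1, 0,  1, 1, 0};
    const double h_tau[nat * 3]  = {0, 0, 0,  0.25, 0.25, 0.25};
    const double h_zv[nat]       = {4.0, 4.0};

    double *d_gcar, *d_tau, *d_zv, *d_re, *d_im;
    cudaMalloc(&d_gcar, sizeof(h_gcar));
    cudaMalloc(&d_tau,  sizeof(h_tau));
    cudaMalloc(&d_zv,   sizeof(h_zv));
    cudaMalloc(&d_re,   npw * sizeof(double));
    cudaMalloc(&d_im,   npw * sizeof(double));
    cudaMemcpy(d_gcar, h_gcar, sizeof(h_gcar), cudaMemcpyHostToDevice);
    cudaMemcpy(d_tau,  h_tau,  sizeof(h_tau),  cudaMemcpyHostToDevice);
    cudaMemcpy(d_zv,   h_zv,   sizeof(h_zv),   cudaMemcpyHostToDevice);
    cudaMemset(d_re, 0, npw * sizeof(double)); // the skipped G=0 entry stays zero
    cudaMemset(d_im, 0, npw * sizeof(double));

    const int threads = 128;
    const int blocks = (npw + threads - 1) / threads;  // same rounding as the patch
    demo_stress_ewa_sincos<<<blocks, threads>>>(nat, npw, ig_gge0,
                                                d_gcar, d_tau, d_zv, d_re, d_im);

    double h_re[npw], h_im[npw];
    cudaMemcpy(h_re, d_re, sizeof(h_re), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_im, d_im, sizeof(h_im), cudaMemcpyDeviceToHost);
    for (int ig = 0; ig < npw; ++ig)
        printf("ig=%d  rhostar = (% .6f, % .6f)\n", ig, h_re[ig], h_im[ig]);

    cudaFree(d_gcar); cudaFree(d_tau); cudaFree(d_zv);
    cudaFree(d_re);   cudaFree(d_im);
    return 0;
}

Running the demo should print one (real, imag) pair per G-vector, with the G=0 entry left at its memset value of zero.
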
source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu

Lines changed: 37 additions & 27 deletions
@@ -1119,6 +1119,7 @@ template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 
 // HIP kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
 template <typename FPTYPE>
 __global__ void cal_stress_ewa_sincos_kernel(
     const int nat,
@@ -1132,36 +1133,48 @@ __global__ void cal_stress_ewa_sincos_kernel(
 {
     const FPTYPE TWO_PI = 2.0 * M_PI;
 
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (iat >= nat) return;
+    if (ig >= npw) return;
 
-    // Load atom information to registers
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE zv = zv_facts[iat];
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
 
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
 
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
-                                       gcar[ig * 3 + 1] * tau_y +
-                                       gcar[ig * 3 + 2] * tau_z);
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
 
         // Use HIP intrinsic for sincos
         FPTYPE sinp, cosp;
         sincos(phase, &sinp, &cosp);
 
-        // Atomic add to accumulate structure factor
-        atomicAdd(&rhostar_real[ig], zv * cosp);
-        atomicAdd(&rhostar_imag[ig], zv * sinp);
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
     }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
 }
 
 // GPU operators
@@ -1177,15 +1190,12 @@ void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     FPTYPE* rhostar_real,
     FPTYPE* rhostar_imag)
 {
-    // Note: Arrays are already initialized to zero in the calling function
-    // No need to initialize again here to avoid redundant operations
-
-    // Calculate optimal grid configuration for GPU load balancing
+    // Calculate grid configuration for G-vector parallelization
     const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
-    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
 
-    dim3 grid(max_blocks_x, nat);
+    // Use 1D grid since we're parallelizing over G-vectors only
+    dim3 grid(num_blocks);
     dim3 block(threads_per_block);
 
     hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
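
Both back ends now size the launch by rounding npw up to a whole number of blocks. A quick worked example of the ceiling division (the numbers are illustrative, not taken from the repository):

// assume npw = 1000 G-vectors and THREADS_PER_BLOCK = 256
// num_blocks = (1000 + 256 - 1) / 256 = 1255 / 256 = 4      (integer division)
// 4 blocks * 256 threads = 1024 threads; the guard `if (ig >= npw) return;`
// retires the 24 surplus threads, and `if (ig == ig_gge0) return;` skips the G=0 term.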
