Skip to content

Commit 9c6eafb

Browse files
committed
implement stress_ewa_op for CUDA & RoCM
1 parent 6148d7b commit 9c6eafb

File tree

4 files changed

+221
-8
lines changed

4 files changed

+221
-8
lines changed

source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,6 +1136,87 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
11361136
template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
11371137
template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
11381138

1139+
// CUDA kernel for stress Ewald sincos calculation
1140+
template <typename FPTYPE>
1141+
__global__ void cal_stress_ewa_sincos_kernel(
1142+
const int nat,
1143+
const int npw,
1144+
const int ig_gge0,
1145+
const FPTYPE* gcar,
1146+
const FPTYPE* tau,
1147+
const FPTYPE* zv_facts,
1148+
FPTYPE* rhostar_real,
1149+
FPTYPE* rhostar_imag)
1150+
{
1151+
const FPTYPE TWO_PI = 2.0 * M_PI;
1152+
1153+
const int iat = blockIdx.y;
1154+
const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
1155+
const int ig_stride = gridDim.x * blockDim.x;
1156+
1157+
if (iat >= nat) return;
1158+
1159+
// Load atom information to registers
1160+
const FPTYPE tau_x = tau[iat * 3 + 0];
1161+
const FPTYPE tau_y = tau[iat * 3 + 1];
1162+
const FPTYPE tau_z = tau[iat * 3 + 2];
1163+
const FPTYPE zv = zv_facts[iat];
1164+
1165+
// Grid-stride loop over G-vectors
1166+
for (int ig = ig_start; ig < npw; ig += ig_stride) {
1167+
// Skip G=0 term
1168+
if (ig == ig_gge0) continue;
1169+
1170+
// Calculate phase: 2π * (G · τ)
1171+
const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
1172+
gcar[ig * 3 + 1] * tau_y +
1173+
gcar[ig * 3 + 2] * tau_z);
1174+
1175+
// Use CUDA intrinsic for sincos
1176+
FPTYPE sinp, cosp;
1177+
sincos(phase, &sinp, &cosp);
1178+
1179+
// Atomic add to accumulate structure factor
1180+
atomicAdd(&rhostar_real[ig], zv * cosp);
1181+
atomicAdd(&rhostar_imag[ig], zv * sinp);
1182+
}
1183+
}
1184+
1185+
// GPU operators
1186+
template <typename FPTYPE>
1187+
void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
1188+
const base_device::DEVICE_GPU* ctx,
1189+
const int& nat,
1190+
const int& npw,
1191+
const int& ig_gge0,
1192+
const FPTYPE* gcar,
1193+
const FPTYPE* tau,
1194+
const FPTYPE* zv_facts,
1195+
FPTYPE* rhostar_real,
1196+
FPTYPE* rhostar_imag)
1197+
{
1198+
// Note: Arrays are already initialized to zero in the calling function
1199+
// No need to initialize again here to avoid redundant operations
1200+
1201+
// Calculate optimal grid configuration for GPU load balancing
1202+
const int threads_per_block = THREADS_PER_BLOCK;
1203+
const int max_blocks_per_sm = 32; // Configurable per GPU architecture
1204+
const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
1205+
1206+
dim3 grid(max_blocks_x, nat);
1207+
dim3 block(threads_per_block);
1208+
1209+
cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(
1210+
nat, npw, ig_gge0, gcar, tau, zv_facts,
1211+
rhostar_real, rhostar_imag
1212+
);
1213+
1214+
cudaCheckOnDebug();
1215+
}
1216+
1217+
template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
1218+
template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
1219+
11391220
// template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
11401221
// template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
11411222
} // namespace hamilt

source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,4 +1117,86 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
11171117

11181118
template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
11191119
template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
1120+
1121+
// HIP kernel for stress Ewald sincos calculation
1122+
template <typename FPTYPE>
1123+
__global__ void cal_stress_ewa_sincos_kernel(
1124+
const int nat,
1125+
const int npw,
1126+
const int ig_gge0,
1127+
const FPTYPE* gcar,
1128+
const FPTYPE* tau,
1129+
const FPTYPE* zv_facts,
1130+
FPTYPE* rhostar_real,
1131+
FPTYPE* rhostar_imag)
1132+
{
1133+
const FPTYPE TWO_PI = 2.0 * M_PI;
1134+
1135+
const int iat = blockIdx.y;
1136+
const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
1137+
const int ig_stride = gridDim.x * blockDim.x;
1138+
1139+
if (iat >= nat) return;
1140+
1141+
// Load atom information to registers
1142+
const FPTYPE tau_x = tau[iat * 3 + 0];
1143+
const FPTYPE tau_y = tau[iat * 3 + 1];
1144+
const FPTYPE tau_z = tau[iat * 3 + 2];
1145+
const FPTYPE zv = zv_facts[iat];
1146+
1147+
// Grid-stride loop over G-vectors
1148+
for (int ig = ig_start; ig < npw; ig += ig_stride) {
1149+
// Skip G=0 term
1150+
if (ig == ig_gge0) continue;
1151+
1152+
// Calculate phase: 2π * (G · τ)
1153+
const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
1154+
gcar[ig * 3 + 1] * tau_y +
1155+
gcar[ig * 3 + 2] * tau_z);
1156+
1157+
// Use HIP intrinsic for sincos
1158+
FPTYPE sinp, cosp;
1159+
sincos(phase, &sinp, &cosp);
1160+
1161+
// Atomic add to accumulate structure factor
1162+
atomicAdd(&rhostar_real[ig], zv * cosp);
1163+
atomicAdd(&rhostar_imag[ig], zv * sinp);
1164+
}
1165+
}
1166+
1167+
// GPU operators
1168+
template <typename FPTYPE>
1169+
void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
1170+
const base_device::DEVICE_GPU* ctx,
1171+
const int& nat,
1172+
const int& npw,
1173+
const int& ig_gge0,
1174+
const FPTYPE* gcar,
1175+
const FPTYPE* tau,
1176+
const FPTYPE* zv_facts,
1177+
FPTYPE* rhostar_real,
1178+
FPTYPE* rhostar_imag)
1179+
{
1180+
// Note: Arrays are already initialized to zero in the calling function
1181+
// No need to initialize again here to avoid redundant operations
1182+
1183+
// Calculate optimal grid configuration for GPU load balancing
1184+
const int threads_per_block = THREADS_PER_BLOCK;
1185+
const int max_blocks_per_sm = 32; // Configurable per GPU architecture
1186+
const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
1187+
1188+
dim3 grid(max_blocks_x, nat);
1189+
dim3 block(threads_per_block);
1190+
1191+
hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
1192+
nat, npw, ig_gge0, gcar, tau, zv_facts,
1193+
rhostar_real, rhostar_imag
1194+
);
1195+
1196+
hipCheckOnDebug();
1197+
}
1198+
1199+
template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
1200+
template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
1201+
11201202
} // namespace hamilt

source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -775,9 +775,8 @@ struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_CPU>
775775
{
776776
const FPTYPE TWO_PI = 2.0 * M_PI;
777777

778-
// Initialize output arrays
779-
std::fill(rhostar_real, rhostar_real + npw, static_cast<FPTYPE>(0.0));
780-
std::fill(rhostar_imag, rhostar_imag + npw, static_cast<FPTYPE>(0.0));
778+
// Note: Arrays are already initialized to zero in the calling function
779+
// No need to initialize again here to avoid redundant operations
781780

782781
#ifdef _OPENMP
783782
#pragma omp parallel for

source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,19 +83,70 @@ void Stress_Func<FPTYPE, Device>::stress_ewa(const UnitCell& ucell,
8383
std::vector<FPTYPE> rhostar_real_host(rho_basis->npw);
8484
std::vector<FPTYPE> rhostar_imag_host(rho_basis->npw);
8585

86+
// Device pointers for GPU data transfer
87+
FPTYPE* d_gcar = nullptr;
88+
FPTYPE* d_tau = nullptr;
89+
FPTYPE* d_zv_facts = nullptr;
90+
FPTYPE* d_rhostar_real = nullptr;
91+
FPTYPE* d_rhostar_imag = nullptr;
92+
93+
// GPU memory management and data transfer
94+
if (this->device == base_device::GpuDevice) {
95+
// Allocate GPU memory
96+
resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
97+
resmem_var_op()(this->ctx, d_tau, ucell.nat * 3);
98+
resmem_var_op()(this->ctx, d_zv_facts, ucell.nat);
99+
resmem_var_op()(this->ctx, d_rhostar_real, rho_basis->npw);
100+
resmem_var_op()(this->ctx, d_rhostar_imag, rho_basis->npw);
101+
102+
// Data transfer H2D
103+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
104+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), ucell.nat * 3);
105+
syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_zv_facts, zv_facts_host.data(), ucell.nat);
106+
107+
// Initialize output arrays to zero on GPU
108+
base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_real, 0.0, rho_basis->npw);
109+
base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_imag, 0.0, rho_basis->npw);
110+
} else {
111+
// CPU case: use host pointers directly and initialize arrays to zero
112+
d_gcar = gcar_flat.data();
113+
d_tau = tau_flat.data();
114+
d_zv_facts = zv_facts_host.data();
115+
d_rhostar_real = rhostar_real_host.data();
116+
d_rhostar_imag = rhostar_imag_host.data();
117+
118+
// Initialize output arrays to zero for CPU case
119+
std::fill(rhostar_real_host.begin(), rhostar_real_host.end(), static_cast<FPTYPE>(0.0));
120+
std::fill(rhostar_imag_host.begin(), rhostar_imag_host.end(), static_cast<FPTYPE>(0.0));
121+
}
122+
86123
// Call sincos op (outside OpenMP parallel region, op has its own parallelization)
87124
hamilt::cal_stress_ewa_sincos_op<FPTYPE, Device>()(
88125
this->ctx,
89126
ucell.nat,
90127
rho_basis->npw,
91128
rho_basis->ig_gge0,
92-
gcar_flat.data(),
93-
tau_flat.data(),
94-
zv_facts_host.data(),
95-
rhostar_real_host.data(),
96-
rhostar_imag_host.data()
129+
d_gcar,
130+
d_tau,
131+
d_zv_facts,
132+
d_rhostar_real,
133+
d_rhostar_imag
97134
);
98135

136+
// GPU data transfer D2H and memory cleanup
137+
if (this->device == base_device::GpuDevice) {
138+
// Transfer results back to host
139+
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_real_host.data(), d_rhostar_real, rho_basis->npw);
140+
syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_imag_host.data(), d_rhostar_imag, rho_basis->npw);
141+
142+
// Free GPU memory
143+
delmem_var_op()(this->ctx, d_gcar);
144+
delmem_var_op()(this->ctx, d_tau);
145+
delmem_var_op()(this->ctx, d_zv_facts);
146+
delmem_var_op()(this->ctx, d_rhostar_real);
147+
delmem_var_op()(this->ctx, d_rhostar_imag);
148+
}
149+
99150
#ifdef _OPENMP
100151
#pragma omp parallel
101152
{

0 commit comments

Comments
 (0)