Skip to content

Commit 8a6339c

Browse files
committed
fix sincos op for gpu&cpu
1 parent fbfc91a commit 8a6339c

File tree

9 files changed

+751
-1106
lines changed

9 files changed

+751
-1106
lines changed

source/module_hamilt_pw/hamilt_pwdft/forces.cpp

Lines changed: 341 additions & 273 deletions
Large diffs are not rendered by default.

source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,125 @@
1313

1414
namespace hamilt {
1515

16+
// Accumulates the local-pseudopotential force contribution for every atom.
//
// Launch layout: blockIdx.y indexes the atom (grid.y == nat); the x
// dimension grid-strides over the npw plane-wave G-vectors. Partial sums
// are folded into force[] with atomicAdd, so force[] must already hold
// valid (e.g. zeroed) values before launch.
// NOTE(review): atomicAdd on double requires SM60+ — confirm the build's
// minimum architecture.
template <typename FPTYPE>
__global__ void cal_force_loc_sincos_kernel(
    const int nat,
    const int npw,
    const int ntype,
    const FPTYPE* gcar,
    const FPTYPE* tau,
    const int* iat2it,
    const FPTYPE* vloc_per_type,
    const thrust::complex<FPTYPE>* aux,
    const FPTYPE scale_factor,
    FPTYPE* force)
{
    const FPTYPE two_pi = 2.0 * M_PI;

    const int atom = blockIdx.y;
    if (atom >= nat) return;

    // Cache per-atom data in registers.
    const int type = iat2it[atom];
    const FPTYPE pos[3] = {tau[atom * 3 + 0],
                           tau[atom * 3 + 1],
                           tau[atom * 3 + 2]};

    // Thread-local partial sums for the three Cartesian components.
    FPTYPE acc[3] = {0, 0, 0};

    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step  = gridDim.x * blockDim.x;

    // Grid-stride loop over G-vectors.
    for (int ig = first; ig < npw; ig += step) {
        const FPTYPE gx = gcar[ig * 3 + 0];
        const FPTYPE gy = gcar[ig * 3 + 1];
        const FPTYPE gz = gcar[ig * 3 + 2];

        // phase = 2*pi * (G . tau)
        const FPTYPE phase = two_pi * (gx * pos[0] + gy * pos[1] + gz * pos[2]);

        // sincos computes both trig values in one call (sincosf overload
        // is selected for the float instantiation).
        FPTYPE s, c;
        sincos(phase, &s, &c);

        // weight = V_loc(type, G) * (cos*Im(aux) + sin*Re(aux))
        const FPTYPE weight = vloc_per_type[type * npw + ig]
                              * (c * aux[ig].imag() + s * aux[ig].real());

        acc[0] += gx * weight;
        acc[1] += gy * weight;
        acc[2] += gz * weight;
    }

    // One atomic per component per thread.
    for (int d = 0; d < 3; ++d) {
        atomicAdd(&force[atom * 3 + d], acc[d] * scale_factor);
    }
}
75+
76+
// Accumulates the reciprocal-space Ewald force contribution for every atom.
//
// Launch layout: blockIdx.y indexes the atom (grid.y == nat); the x
// dimension grid-strides over the npw G-vectors, skipping the entry at
// index ig_gge0 (the G=0 term). Results are added into force[] with
// atomicAdd, so force[] must be pre-initialized.
// NOTE(review): iat2it is accepted but not read here — presumably kept for
// signature symmetry with the loc kernel; confirm before removing.
template <typename FPTYPE>
__global__ void cal_force_ew_sincos_kernel(
    const int nat,
    const int npw,
    const int ig_gge0,
    const FPTYPE* gcar,
    const FPTYPE* tau,
    const int* iat2it,
    const FPTYPE* it_facts,
    const thrust::complex<FPTYPE>* aux,
    FPTYPE* force)
{
    const FPTYPE two_pi = 2.0 * M_PI;

    const int atom = blockIdx.y;
    if (atom >= nat) return;

    // Cache per-atom data in registers; it_facts is indexed per atom.
    const FPTYPE pos[3] = {tau[atom * 3 + 0],
                           tau[atom * 3 + 1],
                           tau[atom * 3 + 2]};
    const FPTYPE fact = it_facts[atom];

    // Thread-local partial sums for the three Cartesian components.
    FPTYPE acc[3] = {0, 0, 0};

    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step  = gridDim.x * blockDim.x;

    // Grid-stride loop over G-vectors.
    for (int ig = first; ig < npw; ig += step) {
        // The G=0 term is excluded from the Ewald reciprocal sum.
        if (ig == ig_gge0) continue;

        const FPTYPE gx = gcar[ig * 3 + 0];
        const FPTYPE gy = gcar[ig * 3 + 1];
        const FPTYPE gz = gcar[ig * 3 + 2];

        // phase = 2*pi * (G . tau)
        const FPTYPE phase = two_pi * (gx * pos[0] + gy * pos[1] + gz * pos[2]);

        FPTYPE s, c;
        sincos(phase, &s, &c);

        const FPTYPE weight = fact * (c * aux[ig].imag() + s * aux[ig].real());

        acc[0] += gx * weight;
        acc[1] += gy * weight;
        acc[2] += gz * weight;
    }

    // One atomic per component per thread.
    for (int d = 0; d < 3; ++d) {
        atomicAdd(&force[atom * 3 + d], acc[d]);
    }
}
16135

17136
template <typename FPTYPE>
18137
__global__ void cal_vkb1_nl(
@@ -188,6 +307,67 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
188307
cudaCheckOnDebug();
189308
}
190309

310+
// GPU operators
// Launches cal_force_loc_sincos_kernel to accumulate the local-potential
// force on the GPU. Grid layout: y indexes atoms (one row per atom), x
// holds up to 32 blocks that grid-stride over the npw G-vectors.
//
// @param ctx            device context (dispatch tag, unused)
// @param nat            number of atoms
// @param npw            number of plane-wave G-vectors
// @param ntype          number of atom types
// @param gcar           [npw*3] device array of G-vectors
// @param tau            [nat*3] device array of atomic positions
// @param iat2it         [nat] device map: atom index -> type index
// @param vloc_per_type  [ntype*npw] device array of local potential values
// @param aux            [npw] device array of complex density factors
// @param scale_factor   scale applied to each accumulated force component
// @param force          [nat*3] device output array (accumulated into)
//
// NOTE(review): grid.y is limited to 65535 by CUDA — assumes nat stays
// below that; confirm for very large systems.
template <typename FPTYPE>
void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
    const base_device::DEVICE_GPU* ctx,
    const int& nat,
    const int& npw,
    const int& ntype,
    const FPTYPE* gcar,
    const FPTYPE* tau,
    const int* iat2it,
    const FPTYPE* vloc_per_type,
    const std::complex<FPTYPE>* aux,
    const FPTYPE& scale_factor,
    FPTYPE* force)
{
    // A zero-sized grid dimension is an invalid CUDA launch configuration,
    // so bail out early when there is no work to do.
    if (nat <= 0 || npw <= 0) {
        return;
    }

    // Calculate grid configuration for GPU load balancing: cap the x
    // dimension, but never launch more blocks than G-vectors require.
    const int threads_per_block = THREADS_PER_BLOCK;
    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);

    dim3 grid(max_blocks_x, nat);
    dim3 block(threads_per_block);

    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
        nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
        scale_factor, force
    );

    cudaCheckOnDebug();
}
341+
342+
// Launches cal_force_ew_sincos_kernel to accumulate the reciprocal-space
// Ewald force on the GPU. Grid layout: y indexes atoms, x holds up to 32
// blocks that grid-stride over the npw G-vectors.
//
// @param ctx       device context (dispatch tag, unused)
// @param nat       number of atoms
// @param npw       number of plane-wave G-vectors
// @param ig_gge0   index of the G=0 vector to skip inside the kernel
// @param gcar      [npw*3] device array of G-vectors
// @param tau       [nat*3] device array of atomic positions
// @param iat2it    [nat] device map: atom index -> type index
// @param it_facts  [nat] device array of per-atom Ewald prefactors
// @param aux       [npw] device array of complex structure factors
// @param force     [nat*3] device output array (accumulated into)
//
// NOTE(review): grid.y is limited to 65535 by CUDA — assumes nat stays
// below that; confirm for very large systems.
template <typename FPTYPE>
void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
    const base_device::DEVICE_GPU* ctx,
    const int& nat,
    const int& npw,
    const int& ig_gge0,
    const FPTYPE* gcar,
    const FPTYPE* tau,
    const int* iat2it,
    const FPTYPE* it_facts,
    const std::complex<FPTYPE>* aux,
    FPTYPE* force)
{
    // A zero-sized grid dimension is an invalid CUDA launch configuration,
    // so bail out early when there is no work to do.
    if (nat <= 0 || npw <= 0) {
        return;
    }

    // Calculate grid configuration for GPU load balancing: cap the x
    // dimension, but never launch more blocks than G-vectors require.
    const int threads_per_block = THREADS_PER_BLOCK;
    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);

    dim3 grid(max_blocks_x, nat);
    dim3 block(threads_per_block);

    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
        nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
    );

    cudaCheckOnDebug();
}
370+
191371
template <typename FPTYPE>
192372
__global__ void cal_force_nl(
193373
const int ntype,
@@ -613,8 +793,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple
613793

614794
template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
615795
template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
796+
template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
797+
template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
616798

617799
template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
618800
template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
801+
template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
802+
template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
619803

620804
} // namespace hamilt

0 commit comments

Comments
 (0)