|
13 | 13 |
|
14 | 14 | namespace hamilt { |
15 | 15 |
|
// CUDA kernels for sincos loops
//
// Local-pseudopotential force contribution kernel.
//
// Launch layout: blockIdx.y indexes the atom (grid.y is expected to cover
// nat), while the x-dimension runs a grid-stride loop over the npw
// G-vectors.  Each thread accumulates its partial force in registers and
// commits exactly one atomicAdd per Cartesian component, so `force` is
// accumulated into — the caller is assumed to have initialized it
// (presumably to zero; confirm at the call site).
//
// Parameters:
//   nat            - number of atoms
//   npw            - number of plane waves (G-vectors)
//   ntype          - number of atom types (unused in the body; kept for
//                    interface compatibility with the host operator)
//   gcar           - G-vectors, layout [npw][3]
//   tau            - atomic positions, layout [nat][3]
//   iat2it         - map from atom index to type index
//   vloc_per_type  - per-type local potential factors, layout [ntype][npw]
//   aux            - per-G complex coefficients
//   scale_factor   - overall scale applied once at commit time
//   force          - output forces, layout [nat][3], accumulated atomically
template <typename FPTYPE>
__global__ void cal_force_loc_sincos_kernel(
    const int nat,
    const int npw,
    const int ntype,
    const FPTYPE* __restrict__ gcar,
    const FPTYPE* __restrict__ tau,
    const int* __restrict__ iat2it,
    const FPTYPE* __restrict__ vloc_per_type,
    const thrust::complex<FPTYPE>* __restrict__ aux,
    const FPTYPE scale_factor,
    FPTYPE* force)
{
    // Keep the constant in FPTYPE so float instantiations stay in float.
    const FPTYPE TWO_PI = static_cast<FPTYPE>(2.0 * M_PI);

    const int iat = blockIdx.y;
    if (iat >= nat)
    {
        return;
    }

    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
    const int ig_stride = gridDim.x * blockDim.x;

    // Cache this atom's type and position in registers.
    const int it = iat2it[iat];
    const FPTYPE tau_x = tau[iat * 3 + 0];
    const FPTYPE tau_y = tau[iat * 3 + 1];
    const FPTYPE tau_z = tau[iat * 3 + 2];

    // Per-thread partial sums, committed once at the end.
    FPTYPE local_force_x = FPTYPE(0.0);
    FPTYPE local_force_y = FPTYPE(0.0);
    FPTYPE local_force_z = FPTYPE(0.0);

    // Grid-stride loop over G-vectors
    for (int ig = ig_start; ig < npw; ig += ig_stride)
    {
        // Load the G-vector once: it is needed both for the phase and for
        // the force accumulation below.
        const FPTYPE g_x = gcar[ig * 3 + 0];
        const FPTYPE g_y = gcar[ig * 3 + 1];
        const FPTYPE g_z = gcar[ig * 3 + 2];

        // phase = 2*pi * (G . tau)
        const FPTYPE phase = TWO_PI * (g_x * tau_x + g_y * tau_y + g_z * tau_z);

        // CUDA provides float/double overloads of sincos in device code.
        FPTYPE sinp, cosp;
        sincos(phase, &sinp, &cosp);

        // Weight by the local potential of this atom's type at G.
        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());

        local_force_x += g_x * factor;
        local_force_y += g_y * factor;
        local_force_z += g_z * factor;
    }

    // One atomic commit per component; contention is limited to the
    // gridDim.x blocks sharing this atom's row.
    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
}
| 75 | + |
// Ewald-sum force contribution kernel.
//
// Launch layout: blockIdx.y indexes the atom, the x-dimension runs a
// grid-stride loop over the npw G-vectors.  The G=0 term (index ig_gge0,
// negative when absent) is skipped.  Partial forces are accumulated in
// registers and committed with one atomicAdd per component, so `force` is
// accumulated into — the caller is assumed to have initialized it
// (presumably to zero; confirm at the call site).
//
// Parameters:
//   nat      - number of atoms
//   npw      - number of plane waves (G-vectors)
//   ig_gge0  - index of the |G|=0 vector to skip (no skip if out of range)
//   gcar     - G-vectors, layout [npw][3]
//   tau      - atomic positions, layout [nat][3]
//   iat2it   - atom-to-type map (unused in the body; kept for interface
//              compatibility with the host operator)
//   it_facts - per-atom prefactor, layout [nat]
//   aux      - per-G complex coefficients
//   force    - output forces, layout [nat][3], accumulated atomically
template <typename FPTYPE>
__global__ void cal_force_ew_sincos_kernel(
    const int nat,
    const int npw,
    const int ig_gge0,
    const FPTYPE* __restrict__ gcar,
    const FPTYPE* __restrict__ tau,
    const int* __restrict__ iat2it,
    const FPTYPE* __restrict__ it_facts,
    const thrust::complex<FPTYPE>* __restrict__ aux,
    FPTYPE* force)
{
    // Keep the constant in FPTYPE so float instantiations stay in float.
    const FPTYPE TWO_PI = static_cast<FPTYPE>(2.0 * M_PI);

    const int iat = blockIdx.y;
    if (iat >= nat)
    {
        return;
    }

    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
    const int ig_stride = gridDim.x * blockDim.x;

    // Cache this atom's position and prefactor in registers.
    const FPTYPE tau_x = tau[iat * 3 + 0];
    const FPTYPE tau_y = tau[iat * 3 + 1];
    const FPTYPE tau_z = tau[iat * 3 + 2];
    const FPTYPE it_fact = it_facts[iat];

    // Per-thread partial sums, committed once at the end.
    FPTYPE local_force_x = FPTYPE(0.0);
    FPTYPE local_force_y = FPTYPE(0.0);
    FPTYPE local_force_z = FPTYPE(0.0);

    // Grid-stride loop over G-vectors
    for (int ig = ig_start; ig < npw; ig += ig_stride)
    {
        // Skip the G=0 term; it has no force contribution in the Ewald sum.
        if (ig == ig_gge0)
        {
            continue;
        }

        // Load the G-vector once: it is needed both for the phase and for
        // the force accumulation below.
        const FPTYPE g_x = gcar[ig * 3 + 0];
        const FPTYPE g_y = gcar[ig * 3 + 1];
        const FPTYPE g_z = gcar[ig * 3 + 2];

        // phase = 2*pi * (G . tau)
        const FPTYPE phase = TWO_PI * (g_x * tau_x + g_y * tau_y + g_z * tau_z);

        // CUDA provides float/double overloads of sincos in device code.
        FPTYPE sinp, cosp;
        sincos(phase, &sinp, &cosp);

        // Ewald contribution for this (atom, G) pair.
        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());

        local_force_x += g_x * factor;
        local_force_y += g_y * factor;
        local_force_z += g_z * factor;
    }

    // One atomic commit per component; contention is limited to the
    // gridDim.x blocks sharing this atom's row.
    atomicAdd(&force[iat * 3 + 0], local_force_x);
    atomicAdd(&force[iat * 3 + 1], local_force_y);
    atomicAdd(&force[iat * 3 + 2], local_force_z);
}
16 | 135 |
|
17 | 136 | template <typename FPTYPE> |
18 | 137 | __global__ void cal_vkb1_nl( |
@@ -188,6 +307,67 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev |
188 | 307 | cudaCheckOnDebug(); |
189 | 308 | } |
190 | 309 |
|
// GPU operators
//
// Host-side launcher for cal_force_loc_sincos_kernel.  Maps one grid row
// (blockIdx.y) to each atom and caps the x-dimension so that no more blocks
// are launched than are needed to cover npw with a grid-stride loop.
// NOTE(review): gridDim.y is architecturally limited to 65535, so nat is
// implicitly assumed to fit — confirm against expected system sizes.
template <typename FPTYPE>
void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
    const base_device::DEVICE_GPU* ctx,
    const int& nat,
    const int& npw,
    const int& ntype,
    const FPTYPE* gcar,
    const FPTYPE* tau,
    const int* iat2it,
    const FPTYPE* vloc_per_type,
    const std::complex<FPTYPE>* aux,
    const FPTYPE& scale_factor,
    FPTYPE* force)
{
    // Blocks needed to give every G-vector one thread (ceil division),
    // clamped to a per-architecture tunable cap of 32; the kernel's
    // grid-stride loop covers any remainder.
    const int block_size = THREADS_PER_BLOCK;
    const int blocks_for_npw = (npw + block_size - 1) / block_size;
    const int blocks_x = std::min(32, blocks_for_npw);

    const dim3 grid_dim(blocks_x, nat);
    const dim3 block_dim(block_size);

    cal_force_loc_sincos_kernel<FPTYPE><<<grid_dim, block_dim>>>(
        nat,
        npw,
        ntype,
        gcar,
        tau,
        iat2it,
        vloc_per_type,
        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
        scale_factor,
        force);

    cudaCheckOnDebug();
}
| 341 | + |
// Host-side launcher for cal_force_ew_sincos_kernel.  Mirrors the local
// sincos launcher: one grid row per atom, x-dimension capped so the
// kernel's grid-stride loop covers all npw G-vectors.
// NOTE(review): gridDim.y is architecturally limited to 65535, so nat is
// implicitly assumed to fit — confirm against expected system sizes.
template <typename FPTYPE>
void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
    const base_device::DEVICE_GPU* ctx,
    const int& nat,
    const int& npw,
    const int& ig_gge0,
    const FPTYPE* gcar,
    const FPTYPE* tau,
    const int* iat2it,
    const FPTYPE* it_facts,
    const std::complex<FPTYPE>* aux,
    FPTYPE* force)
{
    // Blocks needed to give every G-vector one thread (ceil division),
    // clamped to a per-architecture tunable cap of 32; the kernel's
    // grid-stride loop covers any remainder.
    const int block_size = THREADS_PER_BLOCK;
    const int blocks_for_npw = (npw + block_size - 1) / block_size;
    const int blocks_x = std::min(32, blocks_for_npw);

    const dim3 grid_dim(blocks_x, nat);
    const dim3 block_dim(block_size);

    cal_force_ew_sincos_kernel<FPTYPE><<<grid_dim, block_dim>>>(
        nat,
        npw,
        ig_gge0,
        gcar,
        tau,
        iat2it,
        it_facts,
        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
        force);

    cudaCheckOnDebug();
}
| 370 | + |
191 | 371 | template <typename FPTYPE> |
192 | 372 | __global__ void cal_force_nl( |
193 | 373 | const int ntype, |
@@ -613,8 +793,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple |
613 | 793 |
|
// Explicit instantiations of the GPU operators for single and double
// precision, so the definitions in this translation unit are emitted for
// both FPTYPE = float and FPTYPE = double.
template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;

template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
619 | 803 |
|
620 | 804 | } // namespace hamilt |
0 commit comments