Skip to content

Commit 7f8adc8

Browse files
authored
Perf: optimize psir_dot function in gint_rho_gpu.cu (#4326)
* remove new in gtask_rho.cpp
* rename some header files
* optimize psir_dot in gint_rho_gpu.cu
* fix a memory leak
* remove redundant clear()
* fix a memory leak
* modify cuda_tools.cu
* modify some header files
* modify synchronization operation
* modify psir_dot
* modify psir_dot
1 parent 51bdd59 commit 7f8adc8

19 files changed

+97
-163
lines changed

source/module_hamilt_lcao/module_gint/gint.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#include "gint.h"
22

33
#if ((defined __CUDA))
4-
#include "gint_force.h"
5-
#include "gint_rho.h"
6-
#include "gint_vl.h"
4+
#include "gint_force_gpu.h"
5+
#include "gint_rho_gpu.h"
6+
#include "gint_vl_gpu.h"
77
#endif
88

99
#include "module_base/memory.h"

source/module_hamilt_lcao/module_gint/gint_force_gpu.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
#include <fstream>
44
#include <sstream>
55

6-
#include "gint_force.h"
6+
#include "gint_force_gpu.h"
77
#include "kernels/cuda/cuda_tools.cuh"
88
#include "kernels/cuda/gint_force.cuh"
99
#include "module_base/ylm.h"
10-
#include "module_hamilt_lcao/module_gint/gint_tools.h"
10+
#include "gint_tools.h"
11+
1112
namespace GintKernel
1213
{
1314

source/module_hamilt_lcao/module_gint/gint_force.h renamed to source/module_hamilt_lcao/module_gint/gint_force_gpu.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
#ifndef GINT_FORCE_H
2-
#define GINT_FORCE_H
1+
#ifndef GINT_FORCE_GPU_H
2+
#define GINT_FORCE_GPU_H
33

44
#include "module_hamilt_lcao/module_gint/gint.h"
55
#include "module_hamilt_lcao/module_gint/grid_technique.h"

source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu

Lines changed: 17 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#include "kernels/cuda/cuda_tools.cuh"
22
#include "module_base/ylm.h"
3-
#include "module_hamilt_lcao/module_gint/gint_rho.h"
4-
#include "module_hamilt_lcao/module_gint/gint_tools.h"
5-
#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh"
3+
#include "gint_rho_gpu.h"
4+
#include "gint_tools.h"
5+
#include "kernels/cuda/gint_rho.cuh"
66
#include "omp.h"
77

88
#include <omp.h>
@@ -68,7 +68,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
6868
{
6969
checkCuda(cudaStreamSynchronize(gridt.streams[i]));
7070
}
71-
7271
// calculate the rho for every nbz bigcells
7372

7473
#pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
@@ -78,6 +77,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
7877
{
7978
// get stream id
8079
int stream_num = omp_get_thread_num();
80+
checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));
8181

8282
// psi_input contains data used to generate the psi values.
8383
// The suffix "_g" indicates that the data is stored in the GPU,
@@ -154,16 +154,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
154154
double* rho_g = gridt.rho_g;
155155

156156
// variables for dot product psir * psir_dm
157-
int dot_count = 0;
158-
int* vec_len = &gridt.vec_len[gridt.num_mcell * stream_num];
159-
double** vec_l = &gridt.vec_l[gridt.num_mcell * stream_num];
160-
double** vec_r = &gridt.vec_r[gridt.num_mcell * stream_num];
161157
double** dot_product
162158
= &gridt.dot_product[gridt.num_mcell * stream_num];
163159

164-
int* vec_len_g = &gridt.vec_len_g[gridt.num_mcell * stream_num];
165-
double** vec_l_g = &gridt.vec_l_g[gridt.num_mcell * stream_num];
166-
double** vec_r_g = &gridt.vec_r_g[gridt.num_mcell * stream_num];
167160
double** dot_product_g
168161
= &gridt.dot_product_g[gridt.num_mcell * stream_num];
169162

@@ -172,7 +165,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
172165
int atom_pair_num = 0;
173166
const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp;
174167
std::vector<bool> gpu_matrix_cal_flag(max_size * gridt.nbzp,false);
175-
checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));
176168

177169
// generate GPU tasks, including the calculation of psir, matrix
178170
// multiplication, and dot product
@@ -211,11 +203,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
211203
max_n,
212204
atom_pair_num,
213205
rho_g,
214-
vec_l,
215-
vec_r,
216-
dot_product,
217-
vec_len,
218-
dot_count);
206+
dot_product);
219207

220208
// Copying data from host to device
221209
checkCuda(cudaMemcpyAsync(input_double_g,
@@ -286,21 +274,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
286274
cudaMemcpyHostToDevice,
287275
gridt.streams[stream_num]));
288276

289-
checkCuda(cudaMemcpyAsync(vec_len_g,
290-
vec_len,
291-
gridt.num_mcell * sizeof(int),
292-
cudaMemcpyHostToDevice,
293-
gridt.streams[stream_num]));
294-
checkCuda(cudaMemcpyAsync(vec_l_g,
295-
vec_l,
296-
gridt.num_mcell * sizeof(double*),
297-
cudaMemcpyHostToDevice,
298-
gridt.streams[stream_num]));
299-
checkCuda(cudaMemcpyAsync(vec_r_g,
300-
vec_r,
301-
gridt.num_mcell * sizeof(double*),
302-
cudaMemcpyHostToDevice,
303-
gridt.streams[stream_num]));
304277
checkCuda(cudaMemcpyAsync(dot_product_g,
305278
dot_product,
306279
gridt.num_mcell * sizeof(double*),
@@ -352,20 +325,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
352325
atom_pair_num,
353326
gridt.streams[stream_num],
354327
ap_alpha_g);
328+
checkCudaLastError();
355329

356330
// Launching kernel to calculate dot product psir * psir_dm
357-
dim3 grid_dot(64);
358-
dim3 block_dot(64);
359-
int incx = 1;
360-
int incy = 1;
361-
psir_dot<<<grid_dot, block_dot, 0, gridt.streams[stream_num]>>>(
362-
vec_len_g,
363-
vec_l_g,
364-
incx,
365-
vec_r_g,
366-
incy,
367-
dot_product_g,
368-
dot_count);
331+
const int block_size = 128;
332+
dim3 block_dot(block_size);
333+
dim3 grid_dot(gridt.nbzp, gridt.bxyz);
334+
psir_dot<<<grid_dot, block_dot, sizeof(double) * block_size, gridt.streams[stream_num]>>>(
335+
gridt.nbzp,
336+
gridt.bxyz,
337+
max_size * ucell.nwmax,
338+
psir_ylm_left_g,
339+
psir_r_g,
340+
dot_product_g);
341+
checkCudaLastError();
369342
}
370343
}
371344

@@ -374,13 +347,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
374347
{
375348
checkCuda(cudaStreamSynchronize(gridt.streams[i]));
376349
}
377-
378350
// Copy rho from device to host
379351
checkCuda(cudaMemcpy(rho,
380352
gridt.rho_g,
381353
nczp * gridt.ncx * gridt.ncy * sizeof(double),
382354
cudaMemcpyDeviceToHost));
383-
384355
// free the memory
385356
checkCuda(cudaFree(dm_matrix_g));
386357
}

source/module_hamilt_lcao/module_gint/gint_rho.h renamed to source/module_hamilt_lcao/module_gint/gint_rho_gpu.h

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#include "module_hamilt_lcao/module_gint/gint.h"
88
#include "module_hamilt_lcao/module_gint/grid_technique.h"
99

10-
cudaError_t checkCuda(cudaError_t result);
1110
namespace GintKernel
1211
{
1312

@@ -90,11 +89,7 @@ void gtask_rho(const Grid_Technique& gridt,
9089
* @param max_n Maximum value of n.
9190
* @param atom_pair_num Total count of atom pairs, which is also the number of matrix multiplication operations.
9291
* @param rho_g Rho.
93-
* @param vec_l Pointers to psir_ylm for vector dot product.
94-
* @param vec_r Pointers to psir_dm for vector dot product.
9592
* @param dot_product Pointers to the results of dot products.
96-
* @param vec_len Vector lengths for each dot product.
97-
* @param dot_count Total count of dot products.
9893
*/
9994
void alloc_mult_dot_rho(const Grid_Technique& gridt,
10095
const UnitCell& ucell,
@@ -120,11 +115,7 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
120115
int& max_n,
121116
int& atom_pair_num,
122117
double* rho_g,
123-
double** vec_l,
124-
double** vec_r,
125-
double** dot_product,
126-
int* vec_len,
127-
int& dot_count);
118+
double** dot_product);
128119

129120
} // namespace GintKernel
130121
#endif

source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
#include "kernels/cuda/cuda_tools.cuh"
44
#include "module_base/ylm.h"
5-
#include "module_hamilt_lcao/module_gint/gint_tools.h"
6-
#include "module_hamilt_lcao/module_gint/gint_vl.h"
7-
#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh"
5+
#include "gint_tools.h"
6+
#include "gint_vl_gpu.h"
7+
#include "kernels/cuda/gint_vl.cuh"
88

99
namespace GintKernel
1010
{

source/module_hamilt_lcao/module_gint/gint_vl.h renamed to source/module_hamilt_lcao/module_gint/gint_vl_gpu.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
1-
#ifndef GINT_VL_H
2-
#define GINT_VL_H
1+
#ifndef GINT_VL_GPU_H
2+
#define GINT_VL_GPU_H
33
#include <cublas_v2.h>
44
#include <cuda.h> // for CUDA_VERSION
55
#include <cuda_runtime.h>
66

77
#include "module_hamilt_lcao/module_gint/gint.h"
88
#include "module_hamilt_lcao/module_gint/grid_technique.h"
99

10-
cudaError_t checkCuda(cudaError_t result);
11-
1210
namespace GintKernel
1311
{
1412

source/module_hamilt_lcao/module_gint/grid_technique.cpp

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in,
185185
#if ((defined __CUDA) /* || (defined __ROCM) */)
186186
if(GlobalV::device_flag == "gpu")
187187
{
188-
this->init_gpu_gint_variables(ucell,orb,num_stream);
188+
this->init_gpu_gint_variables(ucell, orb, num_stream);
189189
}
190190
#endif
191191

@@ -923,22 +923,11 @@ void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Or
923923
num_mcell = nbzp * bxyz;
924924
checkCudaErrors(cudaMalloc((void**)&rho_g, this->ncxyz * sizeof(double)));
925925
checkCudaErrors(cudaMemset(rho_g, 0, this->ncxyz * sizeof(double)));
926-
checkCudaErrors(
927-
cudaMallocHost((void**)&vec_l, num_mcell * nstreams * sizeof(double*)));
928-
checkCudaErrors(
929-
cudaMalloc((void**)&vec_l_g, num_mcell * nstreams * sizeof(double*)));
930-
checkCudaErrors(
931-
cudaMallocHost((void**)&vec_r, num_mcell * nstreams * sizeof(double*)));
932-
checkCudaErrors(
933-
cudaMalloc((void**)&vec_r_g, num_mcell * nstreams * sizeof(double*)));
934926
checkCudaErrors(cudaMallocHost((void**)&dot_product,
935927
num_mcell * nstreams * sizeof(double*)));
936928
checkCudaErrors(cudaMalloc((void**)&dot_product_g,
937929
num_mcell * nstreams * sizeof(double*)));
938-
checkCudaErrors(
939-
cudaMallocHost((void**)&vec_len, num_mcell * nstreams * sizeof(int)));
940-
checkCudaErrors(
941-
cudaMalloc((void**)&vec_len_g, num_mcell * nstreams * sizeof(int)));
930+
942931

943932
for (int i = 0; i < nstreams; ++i)
944933
{
@@ -961,7 +950,10 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
961950
return;
962951
}
963952
for (int i = 0; i < nstreams; ++i)
953+
{
964954
checkCudaErrors(cudaStreamDestroy(streams[i]));
955+
}
956+
delete[] streams;
965957

966958
checkCudaErrors(cudaFree(ylmcoef_g));
967959
checkCudaErrors(cudaFree(atom_nwl_g));
@@ -1020,14 +1012,8 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
10201012
checkCudaErrors(cudaFree(dm_global_g));
10211013
checkCudaErrors(cudaFree(ap_output_gbl_g));
10221014

1023-
checkCudaErrors(cudaFreeHost(vec_len));
1024-
checkCudaErrors(cudaFreeHost(vec_l));
1025-
checkCudaErrors(cudaFreeHost(vec_r));
10261015
checkCudaErrors(cudaFreeHost(dot_product));
10271016

1028-
checkCudaErrors(cudaFree(vec_len_g));
1029-
checkCudaErrors(cudaFree(vec_l_g));
1030-
checkCudaErrors(cudaFree(vec_r_g));
10311017
checkCudaErrors(cudaFree(dot_product_g));
10321018
checkCudaErrors(cudaFree(rho_g));
10331019

source/module_hamilt_lcao/module_gint/grid_technique.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -218,12 +218,6 @@ class Grid_Technique : public Grid_MeshBall
218218
// additional variables for rho calculating
219219
int num_mcell;
220220
double* rho_g;
221-
int* vec_len;
222-
int* vec_len_g;
223-
double** vec_l;
224-
double** vec_l_g;
225-
double** vec_r;
226-
double** vec_r_g;
227221
double** dot_product;
228222
double** dot_product_g;
229223

source/module_hamilt_lcao/module_gint/gtask_force.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#include <omp.h>
22

3-
#include "gint_force.h"
3+
#include "gint_force_gpu.h"
44
#include "module_base/ylm.h"
55
#include "module_hamilt_lcao/module_gint/gint_tools.h"
66
namespace GintKernel
@@ -213,7 +213,6 @@ void alloc_mult_force(const Grid_Technique& gridt,
213213
}
214214
}
215215
atom_pair_num = tid;
216-
gpu_mat_cal_flag.clear();
217216
}
218217

219218

0 commit comments

Comments
 (0)