11#include " kernels/cuda/cuda_tools.cuh"
22#include " module_base/ylm.h"
3- #include " module_hamilt_lcao/module_gint/gint_rho .h"
4- #include " module_hamilt_lcao/module_gint/ gint_tools.h"
5- #include " module_hamilt_lcao/module_gint/ kernels/cuda/gint_rho.cuh"
3+ #include " gint_rho_gpu .h"
4+ #include " gint_tools.h"
5+ #include " kernels/cuda/gint_rho.cuh"
66#include " omp.h"
77
88#include < omp.h>
@@ -68,7 +68,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
6868 {
6969 checkCuda (cudaStreamSynchronize (gridt.streams [i]));
7070 }
71-
7271 // calculate the rho for every nbz bigcells
7372
7473#pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
@@ -78,6 +77,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
7877 {
7978 // get stream id
8079 int stream_num = omp_get_thread_num ();
80+ checkCuda (cudaStreamSynchronize (gridt.streams [stream_num]));
8181
8282 // psi_input contains data used to generate the psi values.
8383 // The suffix "_g" indicates that the data is stored in the GPU,
@@ -154,16 +154,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
154154 double * rho_g = gridt.rho_g ;
155155
156156 // variables for dot product psir * psir_dm
157- int dot_count = 0 ;
158- int * vec_len = &gridt.vec_len [gridt.num_mcell * stream_num];
159- double ** vec_l = &gridt.vec_l [gridt.num_mcell * stream_num];
160- double ** vec_r = &gridt.vec_r [gridt.num_mcell * stream_num];
161157 double ** dot_product
162158 = &gridt.dot_product [gridt.num_mcell * stream_num];
163159
164- int * vec_len_g = &gridt.vec_len_g [gridt.num_mcell * stream_num];
165- double ** vec_l_g = &gridt.vec_l_g [gridt.num_mcell * stream_num];
166- double ** vec_r_g = &gridt.vec_r_g [gridt.num_mcell * stream_num];
167160 double ** dot_product_g
168161 = &gridt.dot_product_g [gridt.num_mcell * stream_num];
169162
@@ -172,7 +165,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
172165 int atom_pair_num = 0 ;
173166 const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp ;
174167 std::vector<bool > gpu_matrix_cal_flag (max_size * gridt.nbzp ,false );
175- checkCuda (cudaStreamSynchronize (gridt.streams [stream_num]));
176168
177169 // generate GPU tasks, including the calculation of psir, matrix
178170 // multiplication, and dot product
@@ -211,11 +203,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
211203 max_n,
212204 atom_pair_num,
213205 rho_g,
214- vec_l,
215- vec_r,
216- dot_product,
217- vec_len,
218- dot_count);
206+ dot_product);
219207
220208 // Copying data from host to device
221209 checkCuda (cudaMemcpyAsync (input_double_g,
@@ -286,21 +274,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
286274 cudaMemcpyHostToDevice,
287275 gridt.streams [stream_num]));
288276
289- checkCuda (cudaMemcpyAsync (vec_len_g,
290- vec_len,
291- gridt.num_mcell * sizeof (int ),
292- cudaMemcpyHostToDevice,
293- gridt.streams [stream_num]));
294- checkCuda (cudaMemcpyAsync (vec_l_g,
295- vec_l,
296- gridt.num_mcell * sizeof (double *),
297- cudaMemcpyHostToDevice,
298- gridt.streams [stream_num]));
299- checkCuda (cudaMemcpyAsync (vec_r_g,
300- vec_r,
301- gridt.num_mcell * sizeof (double *),
302- cudaMemcpyHostToDevice,
303- gridt.streams [stream_num]));
304277 checkCuda (cudaMemcpyAsync (dot_product_g,
305278 dot_product,
306279 gridt.num_mcell * sizeof (double *),
@@ -352,20 +325,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
352325 atom_pair_num,
353326 gridt.streams [stream_num],
354327 ap_alpha_g);
328+ checkCudaLastError ();
355329
356330 // Launching kernel to calculate dot product psir * psir_dm
357- dim3 grid_dot (64 );
358- dim3 block_dot (64 );
359- int incx = 1 ;
360- int incy = 1 ;
361- psir_dot<<<grid_dot, block_dot, 0 , gridt.streams[stream_num]>>> (
362- vec_len_g,
363- vec_l_g,
364- incx,
365- vec_r_g,
366- incy,
367- dot_product_g,
368- dot_count);
331+ const int block_size = 128 ;
332+ dim3 block_dot (block_size);
333+ dim3 grid_dot (gridt.nbzp , gridt.bxyz );
334+ psir_dot<<<grid_dot, block_dot, sizeof (double ) * block_size, gridt.streams[stream_num]>>> (
335+ gridt.nbzp ,
336+ gridt.bxyz ,
337+ max_size * ucell.nwmax ,
338+ psir_ylm_left_g,
339+ psir_r_g,
340+ dot_product_g);
341+ checkCudaLastError ();
369342 }
370343 }
371344
@@ -374,13 +347,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
374347 {
375348 checkCuda (cudaStreamSynchronize (gridt.streams [i]));
376349 }
377-
378350 // Copy rho from device to host
379351 checkCuda (cudaMemcpy (rho,
380352 gridt.rho_g ,
381353 nczp * gridt.ncx * gridt.ncy * sizeof (double ),
382354 cudaMemcpyDeviceToHost));
383-
384355 // free the memory
385356 checkCuda (cudaFree (dm_matrix_g));
386357}
0 commit comments