diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
index d15dc8416a..9cb3e785c4 100644
--- a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
+++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
@@ -51,9 +51,11 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
     const int num_streams = gridt.nstreams;
 
     std::vector<cudaStream_t> streams(num_streams);
+    std::vector<cudaEvent_t> events(num_streams);
     for (int i = 0; i < num_streams; i++)
     {
         checkCuda(cudaStreamCreate(&streams[i]));
+        checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
     }
 
     Cuda_Mem_Wrapper<double> dr_part(3 * max_atom_per_z, num_streams, true);
@@ -89,9 +91,22 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
                          dm->get_wrapper(),
                          dm->get_nnr() * sizeof(double),
                          cudaMemcpyHostToDevice));
+
 #ifdef _OPENMP
-#pragma omp parallel for num_threads(num_streams) collapse(2)
+const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
+#endif
+#pragma omp parallel num_threads(max_thread_num)
+{
+#ifdef _OPENMP
+    const int tid = omp_get_thread_num();
+    const int num_threads = omp_get_num_threads();
+    const int sid_start = tid * num_streams / num_threads;
+    const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
+#else
+    const int sid_start = 0;
+    const int thread_num_streams = num_streams;
 #endif
+#pragma omp for collapse(2) schedule(dynamic)
     for (int i = 0; i < gridt.nbx; i++)
     {
         for (int j = 0; j < gridt.nby; j++)
@@ -99,11 +114,9 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
             // 20240620 Note that it must be set again here because
             // cuda's device is not safe in a multi-threaded environment.
             checkCuda(cudaSetDevice(gridt.dev_id));
-#ifdef _OPENMP
-            const int sid = omp_get_thread_num();
-#else
-            const int sid = 0;
-#endif
+
+            const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
+            checkCuda(cudaEventSynchronize(events[sid]));
 
             int max_m = 0;
             int max_n = 0;
@@ -161,6 +174,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
             gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
             gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
             gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+            checkCuda(cudaEventRecord(events[sid], streams[sid]));
 
             psi.memset_device_async(streams[sid], sid, 0);
             psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -241,9 +255,9 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
                     stress.get_device_pointer(sid));
                 checkCudaLastError();
             }
-            checkCuda(cudaStreamSynchronize(streams[sid]));
         }
     }
+}
 
     for(int i = 0; i < num_streams; i++)
     {
@@ -254,6 +268,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
     for (int i = 0; i < num_streams; i++)
     {
         checkCuda(cudaStreamSynchronize(streams[i]));
+        checkCuda(cudaEventDestroy(events[i]));
     }
 
     if (isstress){
diff --git a/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
index 0fb6accad4..fc5fba03c2 100644
--- a/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
+++ b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
@@ -34,9 +34,11 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
     const int max_atompair_per_z = max_atom * max_atom * nbzp;
 
     std::vector<cudaStream_t> streams(num_streams);
+    std::vector<cudaEvent_t> events(num_streams);
     for (int i = 0; i < num_streams; i++)
     {
         checkCuda(cudaStreamCreate(&streams[i]));
+        checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
     }
 
     Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
@@ -71,8 +73,20 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
     // calculate the rho for every nbzp bigcells
 
 #ifdef _OPENMP
-#pragma omp parallel for num_threads(num_streams) collapse(2)
+const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
 #endif
+#pragma omp parallel num_threads(max_thread_num)
+{
+#ifdef _OPENMP
+    const int tid = omp_get_thread_num();
+    const int num_threads = omp_get_num_threads();
+    const int sid_start = tid * num_streams / num_threads;
+    const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
+#else
+    const int sid_start = 0;
+    const int thread_num_streams = num_streams;
+#endif
+#pragma omp for collapse(2) schedule(dynamic)
     for (int i = 0; i < gridt.nbx; i++)
     {
         for (int j = 0; j < gridt.nby; j++)
@@ -81,12 +95,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
             // cuda's device is not safe in a multi-threaded environment.
             checkCuda(cudaSetDevice(gridt.dev_id));
 
-            // get stream id
-#ifdef _OPENMP
-            const int sid = omp_get_thread_num();
-#else
-            const int sid = 0;
-#endif
+
+            const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
+            checkCuda(cudaEventSynchronize(events[sid]));
 
             int max_m = 0;
             int max_n = 0;
@@ -147,6 +158,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
             gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
             gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
             dot_product.copy_host_to_device_async(streams[sid], sid);
+            checkCuda(cudaEventRecord(events[sid], streams[sid]));
 
             psi.memset_device_async(streams[sid], sid, 0);
             psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -203,9 +215,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
                 psi_dm.get_device_pointer(sid),
                 dot_product.get_device_pointer(sid));
             checkCudaLastError();
-            checkCuda(cudaStreamSynchronize(streams[sid]));
         }
     }
+}
 
     // Copy rho from device to host
     checkCuda(cudaMemcpy(rho,
@@ -216,6 +228,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
     for (int i = 0; i < num_streams; i++)
    {
         checkCuda(cudaStreamDestroy(streams[i]));
+        checkCuda(cudaEventDestroy(events[i]));
     }
 }
 } // namespace GintKernel
diff --git a/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
index bf60ac8d7b..c58896f34c 100644
--- a/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
+++ b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
@@ -41,10 +41,12 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
     const double vfactor = ucell.omega / gridt.ncxyz;
     const int nczp = nbzp * gridt.bz;
 
     std::vector<cudaStream_t> streams(num_streams);
+    std::vector<cudaEvent_t> events(num_streams);
     for (int i = 0; i < num_streams; i++)
     {
         checkCuda(cudaStreamCreate(&streams[i]));
+        checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
     }
 
     const int nnrg = hRGint->get_nnr();
@@ -73,8 +75,20 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
     Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);
 
 #ifdef _OPENMP
-#pragma omp parallel for num_threads(num_streams) collapse(2)
+const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
 #endif
+#pragma omp parallel num_threads(max_thread_num)
+{
+#ifdef _OPENMP
+    const int tid = omp_get_thread_num();
+    const int num_threads = omp_get_num_threads();
+    const int sid_start = tid * num_streams / num_threads;
+    const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
+#else
+    const int sid_start = 0;
+    const int thread_num_streams = num_streams;
+#endif
+#pragma omp for collapse(2) schedule(dynamic)
     for (int i = 0; i < gridt.nbx; i++)
     {
         for (int j = 0; j < gridt.nby; j++)
@@ -82,12 +96,9 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
 
             // 20240620 Note that it must be set again here because
             // cuda's device is not safe in a multi-threaded environment.
             checkCuda(cudaSetDevice(gridt.dev_id));
-#ifdef _OPENMP
-            const int sid = omp_get_thread_num();
-#else
-            const int sid = 0;
-#endif
+            const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
+            checkCuda(cudaEventSynchronize(events[sid]));
             int max_m = 0;
             int max_n = 0;
             int atom_pair_num = 0;
@@ -141,6 +152,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
             gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
             gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
             gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
+            checkCuda(cudaEventRecord(events[sid], streams[sid]));
 
             psi.memset_device_async(streams[sid], sid, 0);
             psi_vldr3.memset_device_async(streams[sid], sid, 0);
@@ -187,9 +199,9 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
                 streams[sid],
                 nullptr);
             checkCudaLastError();
-            checkCuda(cudaStreamSynchronize(streams[sid]));
         }
     }
+}
 
     checkCuda(cudaMemcpy(
         hRGint->get_wrapper(),
@@ -200,6 +212,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
     for (int i = 0; i < num_streams; i++)
     {
         checkCuda(cudaStreamDestroy(streams[i]));
+        checkCuda(cudaEventDestroy(events[i]));
     }
 
 }
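All three kernels adopt the same stream-recycling pattern: every CUDA stream is paired with a non-timing event; each grid column `(i, j)` is mapped to a stream round-robin within the slice `[sid_start, sid_start + thread_num_streams)` owned by its OpenMP thread; `cudaEventSynchronize(events[sid])` waits only until that stream's previous host-to-device copies have completed, so the per-stream host staging buffers can safely be refilled; and the blocking `cudaStreamSynchronize` at the end of each iteration is removed, letting copies and kernels on different streams overlap. Below is a minimal self-contained sketch of that recycling scheme, assuming a single worker thread (the OpenMP slicing is omitted for brevity); the names `check`, `scale`, `h_buf`, `d_buf` and the sizes are illustrative, not part of the ABACUS sources:

```cpp
// Sketch of the per-stream event fence used in the patch above.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#define check(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

__global__ void scale(double* data, int n, double factor)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) { data[idx] *= factor; }
}

int main()
{
    constexpr int num_streams = 4;
    constexpr int buf_len = 1 << 20;
    constexpr int num_tasks = 64;

    std::vector<cudaStream_t> streams(num_streams);
    std::vector<cudaEvent_t> events(num_streams);
    std::vector<double*> h_buf(num_streams); // pinned per-stream staging buffers
    std::vector<double*> d_buf(num_streams);
    for (int i = 0; i < num_streams; i++) {
        check(cudaStreamCreate(&streams[i]));
        // Timing is disabled: the event is used purely as a fence.
        check(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
        check(cudaMallocHost(&h_buf[i], buf_len * sizeof(double)));
        check(cudaMalloc(&d_buf[i], buf_len * sizeof(double)));
    }

    for (int task = 0; task < num_tasks; task++) {
        const int sid = task % num_streams;
        // Block only until the previous H2D copy on this stream has consumed
        // h_buf[sid]; the kernel may still be running on the device.
        check(cudaEventSynchronize(events[sid]));
        for (int k = 0; k < buf_len; k++) { h_buf[sid][k] = double(task); }

        check(cudaMemcpyAsync(d_buf[sid], h_buf[sid], buf_len * sizeof(double),
                              cudaMemcpyHostToDevice, streams[sid]));
        // Mark the point after which h_buf[sid] may be overwritten again.
        check(cudaEventRecord(events[sid], streams[sid]));

        // Stream order protects d_buf[sid] from the next copy on this stream.
        scale<<<(buf_len + 255) / 256, 256, 0, streams[sid]>>>(
            d_buf[sid], buf_len, 2.0);
        // No cudaStreamSynchronize here: work on different streams overlaps.
    }

    check(cudaDeviceSynchronize());
    for (int i = 0; i < num_streams; i++) {
        check(cudaEventDestroy(events[i]));
        check(cudaStreamDestroy(streams[i]));
        check(cudaFreeHost(h_buf[i]));
        check(cudaFree(d_buf[i]));
    }
    return 0;
}
```

Since `cudaEventSynchronize` returns immediately for an event that has never been recorded, the first pass over each stream needs no special casing, which is why the patch can call it unconditionally at the top of the loop body.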