Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
const int num_streams = gridt.nstreams;

std::vector<cudaStream_t> streams(num_streams);
std::vector<cudaEvent_t> events(num_streams);
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamCreate(&streams[i]));
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}

Cuda_Mem_Wrapper<double> dr_part(3 * max_atom_per_z, num_streams, true);
Expand Down Expand Up @@ -89,21 +91,32 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
dm->get_wrapper(),
dm->get_nnr() * sizeof(double),
cudaMemcpyHostToDevice));

#ifdef _OPENMP
#pragma omp parallel for num_threads(num_streams) collapse(2)
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
#endif
#pragma omp parallel num_threads(max_thread_num)
{
#ifdef _OPENMP
const int tid = omp_get_thread_num();
const int num_threads = omp_get_num_threads();
const int sid_start = tid * num_streams / num_threads;
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
#else
const int sid_start = 0;
const int thread_num_streams = num_streams;
#endif
#pragma omp for collapse(2) schedule(dynamic)
for (int i = 0; i < gridt.nbx; i++)
{
for (int j = 0; j < gridt.nby; j++)
{
// 20240620 Note: cudaSetDevice must be called again here because the
// active CUDA device is per-thread state and is not inherited by
// OpenMP worker threads.
checkCuda(cudaSetDevice(gridt.dev_id));
#ifdef _OPENMP
const int sid = omp_get_thread_num();
#else
const int sid = 0;
#endif

const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
checkCuda(cudaEventSynchronize(events[sid]));

int max_m = 0;
int max_n = 0;
Expand Down Expand Up @@ -161,6 +174,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
checkCuda(cudaEventRecord(events[sid], streams[sid]));

psi.memset_device_async(streams[sid], sid, 0);
psi_dm.memset_device_async(streams[sid], sid, 0);
Expand Down Expand Up @@ -241,9 +255,9 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
stress.get_device_pointer(sid));
checkCudaLastError();
}
checkCuda(cudaStreamSynchronize(streams[sid]));
}
}
}

for(int i = 0; i < num_streams; i++)
{
Expand All @@ -254,6 +268,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamSynchronize(streams[i]));
checkCuda(cudaEventDestroy(events[i]));
}

if (isstress){
Expand Down
29 changes: 21 additions & 8 deletions source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
const int max_atompair_per_z = max_atom * max_atom * nbzp;

std::vector<cudaStream_t> streams(num_streams);
std::vector<cudaEvent_t> events(num_streams);
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamCreate(&streams[i]));
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}

Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
Expand Down Expand Up @@ -71,8 +73,20 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,

// calculate the rho for every nbzp bigcells
#ifdef _OPENMP
#pragma omp parallel for num_threads(num_streams) collapse(2)
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
#endif
#pragma omp parallel num_threads(max_thread_num)
{
#ifdef _OPENMP
const int tid = omp_get_thread_num();
const int num_threads = omp_get_num_threads();
const int sid_start = tid * num_streams / num_threads;
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
#else
const int sid_start = 0;
const int thread_num_streams = num_streams;
#endif
#pragma omp for collapse(2) schedule(dynamic)
for (int i = 0; i < gridt.nbx; i++)
{
for (int j = 0; j < gridt.nby; j++)
Expand All @@ -81,12 +95,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
// cuda's device is not safe in a multi-threaded environment.

checkCuda(cudaSetDevice(gridt.dev_id));
// get stream id
#ifdef _OPENMP
const int sid = omp_get_thread_num();
#else
const int sid = 0;
#endif

const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
checkCuda(cudaEventSynchronize(events[sid]));

int max_m = 0;
int max_n = 0;
Expand Down Expand Up @@ -147,6 +158,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
dot_product.copy_host_to_device_async(streams[sid], sid);
checkCuda(cudaEventRecord(events[sid], streams[sid]));

psi.memset_device_async(streams[sid], sid, 0);
psi_dm.memset_device_async(streams[sid], sid, 0);
Expand Down Expand Up @@ -203,9 +215,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
psi_dm.get_device_pointer(sid),
dot_product.get_device_pointer(sid));
checkCudaLastError();
checkCuda(cudaStreamSynchronize(streams[sid]));
}
}
}

// Copy rho from device to host
checkCuda(cudaMemcpy(rho,
Expand All @@ -216,6 +228,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamDestroy(streams[i]));
checkCuda(cudaEventDestroy(events[i]));
}
}
} // namespace GintKernel
27 changes: 20 additions & 7 deletions source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,12 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
const double vfactor = ucell.omega / gridt.ncxyz;
const int nczp = nbzp * gridt.bz;
std::vector<cudaStream_t> streams(num_streams);
std::vector<cudaEvent_t> events(num_streams);

for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamCreate(&streams[i]));
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
}

const int nnrg = hRGint->get_nnr();
Expand Down Expand Up @@ -73,21 +75,30 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);

#ifdef _OPENMP
#pragma omp parallel for num_threads(num_streams) collapse(2)
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
#endif
#pragma omp parallel num_threads(max_thread_num)
{
#ifdef _OPENMP
const int tid = omp_get_thread_num();
const int num_threads = omp_get_num_threads();
const int sid_start = tid * num_streams / num_threads;
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
#else
const int sid_start = 0;
const int thread_num_streams = num_streams;
#endif
#pragma omp for collapse(2) schedule(dynamic)
for (int i = 0; i < gridt.nbx; i++)
{
for (int j = 0; j < gridt.nby; j++)
{
// 20240620 Note: cudaSetDevice must be called again here because the
// active CUDA device is per-thread state and is not inherited by
// OpenMP worker threads.
checkCuda(cudaSetDevice(gridt.dev_id));
#ifdef _OPENMP
const int sid = omp_get_thread_num();
#else
const int sid = 0;
#endif

const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
checkCuda(cudaEventSynchronize(events[sid]));
int max_m = 0;
int max_n = 0;
int atom_pair_num = 0;
Expand Down Expand Up @@ -141,6 +152,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
checkCuda(cudaEventRecord(events[sid], streams[sid]));

psi.memset_device_async(streams[sid], sid, 0);
psi_vldr3.memset_device_async(streams[sid], sid, 0);
Expand Down Expand Up @@ -187,9 +199,9 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
streams[sid],
nullptr);
checkCudaLastError();
checkCuda(cudaStreamSynchronize(streams[sid]));
}
}
}

checkCuda(cudaMemcpy(
hRGint->get_wrapper(),
Expand All @@ -200,6 +212,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
for (int i = 0; i < num_streams; i++)
{
checkCuda(cudaStreamDestroy(streams[i]));
checkCuda(cudaEventDestroy(events[i]));
}
}

Expand Down
Loading