Skip to content

Commit b3b948c

Browse files
committed
optimize stream strategy
1 parent a2ec5d1 commit b3b948c

File tree

3 files changed

+54
-22
lines changed

3 files changed

+54
-22
lines changed

source/module_hamilt_lcao/module_gint/gint_force_gpu.cu

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,11 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
5151
const int num_streams = gridt.nstreams;
5252

5353
std::vector<cudaStream_t> streams(num_streams);
54+
std::vector<cudaEvent_t> events(num_streams);
5455
for (int i = 0; i < num_streams; i++)
5556
{
5657
checkCuda(cudaStreamCreate(&streams[i]));
58+
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
5759
}
5860

5961
Cuda_Mem_Wrapper<double> dr_part(3 * max_atom_per_z, num_streams, true);
@@ -89,21 +91,29 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
8991
dm->get_wrapper(),
9092
dm->get_nnr() * sizeof(double),
9193
cudaMemcpyHostToDevice));
94+
95+
#pragma omp parallel num_threads(num_streams)
96+
{
9297
#ifdef _OPENMP
93-
#pragma omp parallel for num_threads(num_streams) collapse(2)
98+
const int tid = omp_get_thread_num();
99+
const int num_threads = omp_get_num_threads();
100+
const int sid_start = tid * num_streams / num_threads;
101+
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
102+
#else
103+
const int sid_start = 0;
104+
const int thread_num_streams = num_streams;
94105
#endif
106+
#pragma omp for collapse(2) schedule(dynamic)
95107
for (int i = 0; i < gridt.nbx; i++)
96108
{
97109
for (int j = 0; j < gridt.nby; j++)
98110
{
99111
// 20240620 Note that it must be set again here because
100112
// cuda's device is not safe in a multi-threaded environment.
101113
checkCuda(cudaSetDevice(gridt.dev_id));
102-
#ifdef _OPENMP
103-
const int sid = omp_get_thread_num();
104-
#else
105-
const int sid = 0;
106-
#endif
114+
115+
const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
116+
checkCuda(cudaEventSynchronize(events[sid]));
107117

108118
int max_m = 0;
109119
int max_n = 0;
@@ -161,6 +171,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
161171
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
162172
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
163173
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
174+
checkCuda(cudaEventRecord(events[sid], streams[sid]));
164175

165176
psi.memset_device_async(streams[sid], sid, 0);
166177
psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -241,9 +252,9 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
241252
stress.get_device_pointer(sid));
242253
checkCudaLastError();
243254
}
244-
checkCuda(cudaStreamSynchronize(streams[sid]));
245255
}
246256
}
257+
}
247258

248259
for(int i = 0; i < num_streams; i++)
249260
{
@@ -254,6 +265,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
254265
for (int i = 0; i < num_streams; i++)
255266
{
256267
checkCuda(cudaStreamSynchronize(streams[i]));
268+
checkCuda(cudaEventDestroy(events[i]));
257269
}
258270

259271
if (isstress){

source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
3434
const int max_atompair_per_z = max_atom * max_atom * nbzp;
3535

3636
std::vector<cudaStream_t> streams(num_streams);
37+
std::vector<cudaEvent_t> events(num_streams);
3738
for (int i = 0; i < num_streams; i++)
3839
{
3940
checkCuda(cudaStreamCreate(&streams[i]));
41+
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
4042
}
4143

4244
Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
@@ -70,9 +72,18 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
7072
cudaMemcpyHostToDevice));
7173

7274
// calculate the rho for every nbzp bigcells
75+
#pragma omp parallel num_threads(num_streams)
76+
{
7377
#ifdef _OPENMP
74-
#pragma omp parallel for num_threads(num_streams) collapse(2)
78+
const int tid = omp_get_thread_num();
79+
const int num_threads = omp_get_num_threads();
80+
const int sid_start = tid * num_streams / num_threads;
81+
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
82+
#else
83+
const int sid_start = 0;
84+
const int thread_num_streams = num_streams;
7585
#endif
86+
#pragma omp for collapse(2) schedule(dynamic)
7687
for (int i = 0; i < gridt.nbx; i++)
7788
{
7889
for (int j = 0; j < gridt.nby; j++)
@@ -81,12 +92,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
8192
// cuda's device is not safe in a multi-threaded environment.
8293

8394
checkCuda(cudaSetDevice(gridt.dev_id));
84-
// get stream id
85-
#ifdef _OPENMP
86-
const int sid = omp_get_thread_num();
87-
#else
88-
const int sid = 0;
89-
#endif
95+
96+
const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
97+
checkCuda(cudaEventSynchronize(events[sid]));
9098

9199
int max_m = 0;
92100
int max_n = 0;
@@ -147,6 +155,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
147155
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
148156
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
149157
dot_product.copy_host_to_device_async(streams[sid], sid);
158+
checkCuda(cudaEventRecord(events[sid], streams[sid]));
150159

151160
psi.memset_device_async(streams[sid], sid, 0);
152161
psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -203,9 +212,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
203212
psi_dm.get_device_pointer(sid),
204213
dot_product.get_device_pointer(sid));
205214
checkCudaLastError();
206-
checkCuda(cudaStreamSynchronize(streams[sid]));
207215
}
208216
}
217+
}
209218

210219
// Copy rho from device to host
211220
checkCuda(cudaMemcpy(rho,
@@ -216,6 +225,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
216225
for (int i = 0; i < num_streams; i++)
217226
{
218227
checkCuda(cudaStreamDestroy(streams[i]));
228+
checkCuda(cudaEventDestroy(events[i]));
219229
}
220230
}
221231
} // namespace GintKernel

source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,12 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
4141
const double vfactor = ucell.omega / gridt.ncxyz;
4242
const int nczp = nbzp * gridt.bz;
4343
std::vector<cudaStream_t> streams(num_streams);
44+
std::vector<cudaEvent_t> events(num_streams);
4445

4546
for (int i = 0; i < num_streams; i++)
4647
{
4748
checkCuda(cudaStreamCreate(&streams[i]));
49+
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
4850
}
4951

5052
const int nnrg = hRGint->get_nnr();
@@ -72,22 +74,28 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
7274
Cuda_Mem_Wrapper<double*> gemm_B(max_atompair_per_z, num_streams, true);
7375
Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);
7476

77+
#pragma omp parallel num_threads(num_streams)
78+
{
7579
#ifdef _OPENMP
76-
#pragma omp parallel for num_threads(num_streams) collapse(2)
80+
const int tid = omp_get_thread_num();
81+
const int num_threads = omp_get_num_threads();
82+
const int sid_start = tid * num_streams / num_threads;
83+
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
84+
#else
85+
const int sid_start = 0;
86+
const int thread_num_streams = num_streams;
7787
#endif
88+
#pragma omp for collapse(2) schedule(dynamic)
7889
for (int i = 0; i < gridt.nbx; i++)
7990
{
8091
for (int j = 0; j < gridt.nby; j++)
8192
{
8293
// 20240620 Note that it must be set again here because
8394
// cuda's device is not safe in a multi-threaded environment.
8495
checkCuda(cudaSetDevice(gridt.dev_id));
85-
#ifdef _OPENMP
86-
const int sid = omp_get_thread_num();
87-
#else
88-
const int sid = 0;
89-
#endif
9096

97+
const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
98+
checkCuda(cudaEventSynchronize(events[sid]));
9199
int max_m = 0;
92100
int max_n = 0;
93101
int atom_pair_num = 0;
@@ -141,6 +149,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
141149
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
142150
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
143151
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
152+
checkCuda(cudaEventRecord(events[sid], streams[sid]));
144153

145154
psi.memset_device_async(streams[sid], sid, 0);
146155
psi_vldr3.memset_device_async(streams[sid], sid, 0);
@@ -187,9 +196,9 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
187196
streams[sid],
188197
nullptr);
189198
checkCudaLastError();
190-
checkCuda(cudaStreamSynchronize(streams[sid]));
191199
}
192200
}
201+
}
193202

194203
checkCuda(cudaMemcpy(
195204
hRGint->get_wrapper(),
@@ -200,6 +209,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
200209
for (int i = 0; i < num_streams; i++)
201210
{
202211
checkCuda(cudaStreamDestroy(streams[i]));
212+
checkCuda(cudaEventDestroy(events[i]));
203213
}
204214
}
205215

0 commit comments

Comments
 (0)