Skip to content

Commit 7f9dc96

Browse files
authored
Perf: openmp for cal_force_stress (#5956)
* remove wrong timer
* omp for cal_force_stress
* openmp for cal_force_stress in dftu
* openmp for cal_force_stress in dspin
* little change
* fix bug
* fix a bug
1 parent 2825851 commit 7f9dc96

File tree

5 files changed

+101
-39
lines changed

5 files changed

+101
-39
lines changed

source/module_elecstate/module_charge/charge_mixing.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ void Charge_Mixing::mix_reset()
193193
bool Charge_Mixing::if_scf_oscillate(const int iteration, const double drho, const int iternum_used, const double threshold)
194194
{
195195
ModuleBase::TITLE("Charge_Mixing", "if_scf_oscillate");
196-
ModuleBase::timer::tick("Charge_Mixing", "if_scf_oscillate");
197196

198197
if(this->_drho_history.size() == 0)
199198
{
@@ -241,7 +240,4 @@ bool Charge_Mixing::if_scf_oscillate(const int iteration, const double drho, con
241240
}
242241

243242
return false;
244-
245-
ModuleBase::timer::tick("Charge_Mixing", "if_scf_oscillate");
246-
247243
}

source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
3939
}
4040
// 1. calculate <psi|beta> for each pair of atoms
4141
// loop over all on-site atoms
42-
int atom_index = 0;
42+
#pragma omp parallel
43+
{
44+
std::vector<double> stress_local(6, 0);
45+
ModuleBase::matrix force_local(force.nr, force.nc);
46+
#pragma omp for schedule(dynamic)
4347
for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
4448
{
4549
// skip the atoms without plus-U
@@ -51,7 +55,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
5155
continue;
5256
}
5357
const int tlp1 = 2 * target_L + 1;
54-
AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
58+
AdjacentAtomInfo& adjs = this->adjs_all[iat0];
5559

5660
std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
5761
nlm_tot.resize(adjs.adj_num + 1);
@@ -156,8 +160,8 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
156160
const int T1 = adjs.ntype[ad1];
157161
const int I1 = adjs.natom[ad1];
158162
const int iat1 = ucell->itia2iat(T1, I1);
159-
double* force_tmp1 = (cal_force) ? &force(iat1, 0) : nullptr;
160-
double* force_tmp2 = (cal_force) ? &force(iat0, 0) : nullptr;
163+
double* force_tmp1 = (cal_force) ? &force_local(iat1, 0) : nullptr;
164+
double* force_tmp2 = (cal_force) ? &force_local(iat0, 0) : nullptr;
161165
ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
162166
ModuleBase::Vector3<double> dis1 = adjs.adjacent_tau[ad1] - tau0;
163167
for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
@@ -205,13 +209,27 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
205209
this->nspin,
206210
dis1,
207211
dis2,
208-
stress_tmp.data());
212+
stress_local.data());
209213
}
210214
}
211215
}
212216
}
213217
}
214-
218+
#pragma omp critical
219+
{
220+
if(cal_force)
221+
{
222+
force += force_local;
223+
}
224+
if(cal_stress)
225+
{
226+
for(int i = 0; i < 6; i++)
227+
{
228+
stress_tmp[i] += stress_local[i];
229+
}
230+
}
231+
}
232+
}
215233
if (cal_force)
216234
{
217235
#ifdef __MPI

source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dspin_force_stress.hpp

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ void DeltaSpin<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
3232
}
3333
// 1. calculate <psi|beta> for each pair of atoms
3434
// loop over all on-site atoms
35-
int atom_index = 0;
35+
#pragma omp parallel
36+
{
37+
std::vector<double> stress_local(6, 0);
38+
ModuleBase::matrix force_local(force.nr, force.nc);
39+
#pragma omp for schedule(dynamic)
3640
for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
3741
{
3842
if(!this->constraint_atom_list[iat0])
@@ -133,8 +137,8 @@ void DeltaSpin<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
133137
const int T1 = adjs.ntype[ad1];
134138
const int I1 = adjs.natom[ad1];
135139
const int iat1 = ucell->itia2iat(T1, I1);
136-
double* force_tmp1 = (cal_force) ? &force(iat1, 0) : nullptr;
137-
double* force_tmp2 = (cal_force) ? &force(iat0, 0) : nullptr;
140+
double* force_tmp1 = (cal_force) ? &force_local(iat1, 0) : nullptr;
141+
double* force_tmp2 = (cal_force) ? &force_local(iat0, 0) : nullptr;
138142
ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
139143
ModuleBase::Vector3<double> dis1 = adjs.adjacent_tau[ad1] - tau0;
140144
for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
@@ -183,12 +187,27 @@ void DeltaSpin<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
183187
this->nspin,
184188
dis1,
185189
dis2,
186-
stress_tmp.data());
190+
stress_local.data());
187191
}
188192
}
189193
}
190194
}
191195
}
196+
#pragma omp critical
197+
{
198+
if(cal_force)
199+
{
200+
force += force_local;
201+
}
202+
if(cal_stress)
203+
{
204+
for(int i = 0; i < 6; i++)
205+
{
206+
stress_tmp[i] += stress_local[i];
207+
}
208+
}
209+
}
210+
}
192211

193212
if (cal_force)
194213
{

source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_force_stress.hpp

Lines changed: 51 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,22 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
2727
}
2828
// 1. calculate <psi|beta> for each pair of atoms
2929
// loop over all on-site atoms
30-
int atom_index = 0;
30+
#pragma omp parallel
31+
{
32+
std::vector<double> stress_local(6, 0);
33+
ModuleBase::matrix force_local(force.nr, force.nc);
34+
#pragma omp for schedule(dynamic)
3135
for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
3236
{
3337
// fetch tau0 and the (I0, T0) indices of atom iat0, then search for its adjacent atoms
3438
auto tau0 = ucell->get_tau(iat0);
3539
int I0 = 0;
36-
ucell->iat2iait(iat0, &I0, &this->current_type);
40+
int T0 = 0;
41+
ucell->iat2iait(iat0, &I0, &T0);
3742

3843
// first step: find the adjacent atoms and filter the real adjacent atoms
3944
AdjacentAtomInfo adjs;
40-
this->gridD->Find_atom(*ucell, tau0, this->current_type, I0, &adjs);
45+
this->gridD->Find_atom(*ucell, tau0, T0, I0, &adjs);
4146

4247
std::vector<bool> is_adj(adjs.adj_num + 1, false);
4348
for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
@@ -51,7 +56,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
5156
// When equal, the theoretical value of matrix element is zero,
5257
// but the calculated value is not zero due to the numerical error, which would lead to result changes.
5358
if (this->ucell->cal_dtau(iat0, iat1, R_index1).norm() * this->ucell->lat0
54-
< orb_cutoff_[T1] + this->ucell->infoNL.Beta[this->current_type].get_rcut_max())
59+
< orb_cutoff_[T1] + this->ucell->infoNL.Beta[T0].get_rcut_max())
5560
{
5661
is_adj[ad] = true;
5762
}
@@ -89,7 +94,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
8994
int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
9095

9196
ModuleBase::Vector3<double> dtau = tau0 - tau1;
92-
intor_->snap(T1, L1, N1, M1, this->current_type, dtau * this->ucell->lat0, true /*cal_deri*/, nlm);
97+
intor_->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, true /*cal_deri*/, nlm);
9398
// select the elements of nlm with target_L
9499
const int length = nlm[0].size();
95100
std::vector<double> nlm_target(length * 4);
@@ -111,8 +116,8 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
111116
const int T1 = adjs.ntype[ad1];
112117
const int I1 = adjs.natom[ad1];
113118
const int iat1 = ucell->itia2iat(T1, I1);
114-
double* force_tmp1 = (cal_force) ? &force(iat1, 0) : nullptr;
115-
double* force_tmp2 = (cal_force) ? &force(iat0, 0) : nullptr;
119+
double* force_tmp1 = (cal_force) ? &force_local(iat1, 0) : nullptr;
120+
double* force_tmp2 = (cal_force) ? &force_local(iat0, 0) : nullptr;
116121
ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
117122
ModuleBase::Vector3<double> dis1 = adjs.adjacent_tau[ad1] - tau0;
118123
for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
@@ -139,6 +144,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
139144
if (cal_force) {
140145
this->cal_force_IJR(iat1,
141146
iat2,
147+
T0,
142148
paraV,
143149
nlm_iat0[ad1],
144150
nlm_iat0[ad2],
@@ -151,18 +157,35 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
151157
if (cal_stress) {
152158
this->cal_stress_IJR(iat1,
153159
iat2,
160+
T0,
154161
paraV,
155162
nlm_iat0[ad1],
156163
nlm_iat0[ad2],
157164
tmp,
158165
dis1,
159166
dis2,
160-
stress_tmp.data());
167+
stress_local.data());
161168
}
162169
}
163170
}
164171
}
165172
}
173+
#pragma omp critical
174+
{
175+
if(cal_force)
176+
{
177+
force += force_local;
178+
}
179+
if(cal_stress)
180+
{
181+
for(int i = 0; i < 6; i++)
182+
{
183+
stress_tmp[i] += stress_local[i];
184+
}
185+
}
186+
187+
}
188+
}
166189

167190
if (cal_force)
168191
{
@@ -202,6 +225,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
202225
template <>
203226
void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_force_IJR(const int& iat1,
204227
const int& iat2,
228+
const int& T0,
205229
const Parallel_Orbitals* paraV,
206230
const std::unordered_map<int, std::vector<double>>& nlm1_all,
207231
const std::unordered_map<int, std::vector<double>>& nlm2_all,
@@ -241,11 +265,11 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
241265
std::vector<std::complex<double>> nlm_tmp(12, ModuleBase::ZERO);
242266
for (int is = 0; is < 4; ++is)
243267
{
244-
for (int no = 0; no < this->ucell->atoms[this->current_type].ncpp.non_zero_count_soc[is]; no++)
268+
for (int no = 0; no < this->ucell->atoms[T0].ncpp.non_zero_count_soc[is]; no++)
245269
{
246-
const int p1 = this->ucell->atoms[this->current_type].ncpp.index1_soc[is][no];
247-
const int p2 = this->ucell->atoms[this->current_type].ncpp.index2_soc[is][no];
248-
this->ucell->atoms[this->current_type].ncpp.get_d(is, p1, p2, tmp_d);
270+
const int p1 = this->ucell->atoms[T0].ncpp.index1_soc[is][no];
271+
const int p2 = this->ucell->atoms[T0].ncpp.index2_soc[is][no];
272+
this->ucell->atoms[T0].ncpp.get_d(is, p1, p2, tmp_d);
249273
nlm_tmp[is*3] += nlm1[p1 + length] * nlm2[p2] * (*tmp_d);
250274
nlm_tmp[is*3+1] += nlm1[p1 + length * 2] * nlm2[p2] * (*tmp_d);
251275
nlm_tmp[is*3+2] += nlm1[p1 + length * 3] * nlm2[p2] * (*tmp_d);
@@ -270,6 +294,7 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
270294
template <>
271295
void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_stress_IJR(const int& iat1,
272296
const int& iat2,
297+
const int& T0,
273298
const Parallel_Orbitals* paraV,
274299
const std::unordered_map<int, std::vector<double>>& nlm1_all,
275300
const std::unordered_map<int, std::vector<double>>& nlm2_all,
@@ -311,11 +336,11 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
311336
std::vector<std::complex<double>> nlm_tmp(npol2 * 6, ModuleBase::ZERO);
312337
for (int is = 0; is < 4; ++is)
313338
{
314-
for (int no = 0; no < this->ucell->atoms[this->current_type].ncpp.non_zero_count_soc[is]; no++)
339+
for (int no = 0; no < this->ucell->atoms[T0].ncpp.non_zero_count_soc[is]; no++)
315340
{
316-
const int p1 = this->ucell->atoms[this->current_type].ncpp.index1_soc[is][no];
317-
const int p2 = this->ucell->atoms[this->current_type].ncpp.index2_soc[is][no];
318-
this->ucell->atoms[this->current_type].ncpp.get_d(is, p1, p2, tmp_d);
341+
const int p1 = this->ucell->atoms[T0].ncpp.index1_soc[is][no];
342+
const int p2 = this->ucell->atoms[T0].ncpp.index2_soc[is][no];
343+
this->ucell->atoms[T0].ncpp.get_d(is, p1, p2, tmp_d);
319344
nlm_tmp[is*6] += (nlm1[p1 + length] * dis1.x * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.x) * (*tmp_d);
320345
nlm_tmp[is*6+1] += (nlm1[p1 + length] * dis1.y * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.y) * (*tmp_d);
321346
nlm_tmp[is*6+2] += (nlm1[p1 + length] * dis1.z * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.z) * (*tmp_d);
@@ -341,6 +366,7 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
341366
template <typename TK, typename TR>
342367
void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
343368
const int& iat2,
369+
const int& T0,
344370
const Parallel_Orbitals* paraV,
345371
const std::unordered_map<int, std::vector<double>>& nlm1_all,
346372
const std::unordered_map<int, std::vector<double>>& nlm2_all,
@@ -367,11 +393,11 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
367393
assert(nlm1.size() == nlm2.size());
368394
#endif
369395
std::vector<double> nlm_tmp(3, 0.0);
370-
for (int no = 0; no < this->ucell->atoms[this->current_type].ncpp.non_zero_count_soc[0]; no++)
396+
for (int no = 0; no < this->ucell->atoms[T0].ncpp.non_zero_count_soc[0]; no++)
371397
{
372-
const int p1 = this->ucell->atoms[this->current_type].ncpp.index1_soc[0][no];
373-
const int p2 = this->ucell->atoms[this->current_type].ncpp.index2_soc[0][no];
374-
this->ucell->atoms[this->current_type].ncpp.get_d(0, p1, p2, tmp_d);
398+
const int p1 = this->ucell->atoms[T0].ncpp.index1_soc[0][no];
399+
const int p2 = this->ucell->atoms[T0].ncpp.index2_soc[0][no];
400+
this->ucell->atoms[T0].ncpp.get_d(0, p1, p2, tmp_d);
375401
nlm_tmp[0] += nlm1[p1 + length] * nlm2[p2] * (*tmp_d);
376402
nlm_tmp[1] += nlm1[p1 + length * 2] * nlm2[p2] * (*tmp_d);
377403
nlm_tmp[2] += nlm1[p1 + length * 3] * nlm2[p2] * (*tmp_d);
@@ -390,6 +416,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
390416
template <typename TK, typename TR>
391417
void NonlocalNew<OperatorLCAO<TK, TR>>::cal_stress_IJR(const int& iat1,
392418
const int& iat2,
419+
const int& T0,
393420
const Parallel_Orbitals* paraV,
394421
const std::unordered_map<int, std::vector<double>>& nlm1_all,
395422
const std::unordered_map<int, std::vector<double>>& nlm2_all,
@@ -417,11 +444,11 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_stress_IJR(const int& iat1,
417444
assert(nlm1.size() == nlm2.size());
418445
#endif
419446
std::vector<double> nlm_tmp(6, 0.0);
420-
for (int no = 0; no < this->ucell->atoms[this->current_type].ncpp.non_zero_count_soc[0]; no++)
447+
for (int no = 0; no < this->ucell->atoms[T0].ncpp.non_zero_count_soc[0]; no++)
421448
{
422-
const int p1 = this->ucell->atoms[this->current_type].ncpp.index1_soc[0][no];
423-
const int p2 = this->ucell->atoms[this->current_type].ncpp.index2_soc[0][no];
424-
this->ucell->atoms[this->current_type].ncpp.get_d(0, p1, p2, tmp_d);
449+
const int p1 = this->ucell->atoms[T0].ncpp.index1_soc[0][no];
450+
const int p2 = this->ucell->atoms[T0].ncpp.index2_soc[0][no];
451+
this->ucell->atoms[T0].ncpp.get_d(0, p1, p2, tmp_d);
425452
nlm_tmp[0] += (nlm1[p1 + length] * dis1.x * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.x) * (*tmp_d);
426453
nlm_tmp[1] += (nlm1[p1 + length] * dis1.y * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.y) * (*tmp_d);
427454
nlm_tmp[2] += (nlm1[p1 + length] * dis1.z * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.z) * (*tmp_d);

source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,13 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
102102
TR* data_pointer);
103103

104104
const Grid_Driver* gridD = nullptr;
105-
int current_type = 0;
105+
106106
/**
107107
* @brief calculate the atomic Force of <I,J,R> atom pair
108108
*/
109109
void cal_force_IJR(const int& iat1,
110110
const int& iat2,
111+
const int& T0,
111112
const Parallel_Orbitals* paraV,
112113
const std::unordered_map<int, std::vector<double>>& nlm1_all,
113114
const std::unordered_map<int, std::vector<double>>& nlm2_all,
@@ -119,6 +120,7 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
119120
*/
120121
void cal_stress_IJR(const int& iat1,
121122
const int& iat2,
123+
const int& T0,
122124
const Parallel_Orbitals* paraV,
123125
const std::unordered_map<int, std::vector<double>>& nlm1_all,
124126
const std::unordered_map<int, std::vector<double>>& nlm2_all,

0 commit comments

Comments
 (0)