@@ -27,17 +27,22 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
2727 }
2828 // 1. calculate <psi|beta> for each pair of atoms
2929 // loop over all on-site atoms
30- int atom_index = 0 ;
30+ #pragma omp parallel
31+ {
32+ std::vector<double > stress_local (6 , 0 );
33+ ModuleBase::matrix force_local (force.nr , force.nc );
34+ #pragma omp for schedule(dynamic)
3135 for (int iat0 = 0 ; iat0 < this ->ucell ->nat ; iat0++)
3236 {
3337 // skip the atoms without plus-U
3438 auto tau0 = ucell->get_tau (iat0);
3539 int I0 = 0 ;
36- ucell->iat2iait (iat0, &I0, &this ->current_type );
40+ int T0 = 0 ;
41+ ucell->iat2iait (iat0, &I0, &T0);
3742
3843 // first step: find the adjacent atoms and filter the real adjacent atoms
3944 AdjacentAtomInfo adjs;
40- this ->gridD ->Find_atom (*ucell, tau0, this -> current_type , I0, &adjs);
45+ this ->gridD ->Find_atom (*ucell, tau0, T0 , I0, &adjs);
4146
4247 std::vector<bool > is_adj (adjs.adj_num + 1 , false );
4348 for (int ad = 0 ; ad < adjs.adj_num + 1 ; ++ad)
@@ -51,7 +56,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
5156 // When equal, the theoretical value of matrix element is zero,
5257 // but the calculated value is not zero due to the numerical error, which would lead to result changes.
5358 if (this ->ucell ->cal_dtau (iat0, iat1, R_index1).norm () * this ->ucell ->lat0
54- < orb_cutoff_[T1] + this ->ucell ->infoNL .Beta [this -> current_type ].get_rcut_max ())
59+ < orb_cutoff_[T1] + this ->ucell ->infoNL .Beta [T0 ].get_rcut_max ())
5560 {
5661 is_adj[ad] = true ;
5762 }
@@ -89,7 +94,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
8994 int M1 = (m1 % 2 == 0 ) ? -m1 / 2 : (m1 + 1 ) / 2 ;
9095
9196 ModuleBase::Vector3<double > dtau = tau0 - tau1;
92- intor_->snap (T1, L1, N1, M1, this -> current_type , dtau * this ->ucell ->lat0 , true /* cal_deri*/ , nlm);
97+ intor_->snap (T1, L1, N1, M1, T0 , dtau * this ->ucell ->lat0 , true /* cal_deri*/ , nlm);
9398 // select the elements of nlm with target_L
9499 const int length = nlm[0 ].size ();
95100 std::vector<double > nlm_target (length * 4 );
@@ -111,8 +116,8 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
111116 const int T1 = adjs.ntype [ad1];
112117 const int I1 = adjs.natom [ad1];
113118 const int iat1 = ucell->itia2iat (T1, I1);
114- double * force_tmp1 = (cal_force) ? &force (iat1, 0 ) : nullptr ;
115- double * force_tmp2 = (cal_force) ? &force (iat0, 0 ) : nullptr ;
119+ double * force_tmp1 = (cal_force) ? &force_local (iat1, 0 ) : nullptr ;
120+ double * force_tmp2 = (cal_force) ? &force_local (iat0, 0 ) : nullptr ;
116121 ModuleBase::Vector3<int >& R_index1 = adjs.box [ad1];
117122 ModuleBase::Vector3<double > dis1 = adjs.adjacent_tau [ad1] - tau0;
118123 for (int ad2 = 0 ; ad2 < adjs.adj_num + 1 ; ++ad2)
@@ -139,6 +144,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
139144 if (cal_force) {
140145 this ->cal_force_IJR (iat1,
141146 iat2,
147+ T0,
142148 paraV,
143149 nlm_iat0[ad1],
144150 nlm_iat0[ad2],
@@ -151,18 +157,35 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
151157 if (cal_stress) {
152158 this ->cal_stress_IJR (iat1,
153159 iat2,
160+ T0,
154161 paraV,
155162 nlm_iat0[ad1],
156163 nlm_iat0[ad2],
157164 tmp,
158165 dis1,
159166 dis2,
160- stress_tmp .data ());
167+ stress_local .data ());
161168 }
162169 }
163170 }
164171 }
165172 }
173+ #pragma omp critical
174+ {
175+ if (cal_force)
176+ {
177+ force += force_local;
178+ }
179+ if (cal_stress)
180+ {
181+ for (int i = 0 ; i < 6 ; i++)
182+ {
183+ stress_tmp[i] += stress_local[i];
184+ }
185+ }
186+
187+ }
188+ }
166189
167190 if (cal_force)
168191 {
@@ -202,6 +225,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
202225template <>
203226void NonlocalNew<OperatorLCAO<std::complex <double >, std::complex <double >>>::cal_force_IJR(const int & iat1,
204227 const int & iat2,
228+ const int & T0,
205229 const Parallel_Orbitals* paraV,
206230 const std::unordered_map<int , std::vector<double >>& nlm1_all,
207231 const std::unordered_map<int , std::vector<double >>& nlm2_all,
@@ -241,11 +265,11 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
241265 std::vector<std::complex <double >> nlm_tmp (12 , ModuleBase::ZERO);
242266 for (int is = 0 ; is < 4 ; ++is)
243267 {
244- for (int no = 0 ; no < this ->ucell ->atoms [this -> current_type ].ncpp .non_zero_count_soc [is]; no++)
268+ for (int no = 0 ; no < this ->ucell ->atoms [T0 ].ncpp .non_zero_count_soc [is]; no++)
245269 {
246- const int p1 = this ->ucell ->atoms [this -> current_type ].ncpp .index1_soc [is][no];
247- const int p2 = this ->ucell ->atoms [this -> current_type ].ncpp .index2_soc [is][no];
248- this ->ucell ->atoms [this -> current_type ].ncpp .get_d (is, p1, p2, tmp_d);
270+ const int p1 = this ->ucell ->atoms [T0 ].ncpp .index1_soc [is][no];
271+ const int p2 = this ->ucell ->atoms [T0 ].ncpp .index2_soc [is][no];
272+ this ->ucell ->atoms [T0 ].ncpp .get_d (is, p1, p2, tmp_d);
249273 nlm_tmp[is*3 ] += nlm1[p1 + length] * nlm2[p2] * (*tmp_d);
250274 nlm_tmp[is*3 +1 ] += nlm1[p1 + length * 2 ] * nlm2[p2] * (*tmp_d);
251275 nlm_tmp[is*3 +2 ] += nlm1[p1 + length * 3 ] * nlm2[p2] * (*tmp_d);
@@ -270,6 +294,7 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
270294template <>
271295void NonlocalNew<OperatorLCAO<std::complex <double >, std::complex <double >>>::cal_stress_IJR(const int & iat1,
272296 const int & iat2,
297+ const int & T0,
273298 const Parallel_Orbitals* paraV,
274299 const std::unordered_map<int , std::vector<double >>& nlm1_all,
275300 const std::unordered_map<int , std::vector<double >>& nlm2_all,
@@ -311,11 +336,11 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
311336 std::vector<std::complex <double >> nlm_tmp (npol2 * 6 , ModuleBase::ZERO);
312337 for (int is = 0 ; is < 4 ; ++is)
313338 {
314- for (int no = 0 ; no < this ->ucell ->atoms [this -> current_type ].ncpp .non_zero_count_soc [is]; no++)
339+ for (int no = 0 ; no < this ->ucell ->atoms [T0 ].ncpp .non_zero_count_soc [is]; no++)
315340 {
316- const int p1 = this ->ucell ->atoms [this -> current_type ].ncpp .index1_soc [is][no];
317- const int p2 = this ->ucell ->atoms [this -> current_type ].ncpp .index2_soc [is][no];
318- this ->ucell ->atoms [this -> current_type ].ncpp .get_d (is, p1, p2, tmp_d);
341+ const int p1 = this ->ucell ->atoms [T0 ].ncpp .index1_soc [is][no];
342+ const int p2 = this ->ucell ->atoms [T0 ].ncpp .index2_soc [is][no];
343+ this ->ucell ->atoms [T0 ].ncpp .get_d (is, p1, p2, tmp_d);
319344 nlm_tmp[is*6 ] += (nlm1[p1 + length] * dis1.x * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.x ) * (*tmp_d);
320345 nlm_tmp[is*6 +1 ] += (nlm1[p1 + length] * dis1.y * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.y ) * (*tmp_d);
321346 nlm_tmp[is*6 +2 ] += (nlm1[p1 + length] * dis1.z * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.z ) * (*tmp_d);
@@ -341,6 +366,7 @@ void NonlocalNew<OperatorLCAO<std::complex<double>, std::complex<double>>>::cal_
341366template <typename TK, typename TR>
342367void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_IJR(const int & iat1,
343368 const int & iat2,
369+ const int & T0,
344370 const Parallel_Orbitals* paraV,
345371 const std::unordered_map<int , std::vector<double >>& nlm1_all,
346372 const std::unordered_map<int , std::vector<double >>& nlm2_all,
@@ -367,11 +393,11 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
367393 assert (nlm1.size () == nlm2.size ());
368394#endif
369395 std::vector<double > nlm_tmp (3 , 0.0 );
370- for (int no = 0 ; no < this ->ucell ->atoms [this -> current_type ].ncpp .non_zero_count_soc [0 ]; no++)
396+ for (int no = 0 ; no < this ->ucell ->atoms [T0 ].ncpp .non_zero_count_soc [0 ]; no++)
371397 {
372- const int p1 = this ->ucell ->atoms [this -> current_type ].ncpp .index1_soc [0 ][no];
373- const int p2 = this ->ucell ->atoms [this -> current_type ].ncpp .index2_soc [0 ][no];
374- this ->ucell ->atoms [this -> current_type ].ncpp .get_d (0 , p1, p2, tmp_d);
398+ const int p1 = this ->ucell ->atoms [T0 ].ncpp .index1_soc [0 ][no];
399+ const int p2 = this ->ucell ->atoms [T0 ].ncpp .index2_soc [0 ][no];
400+ this ->ucell ->atoms [T0 ].ncpp .get_d (0 , p1, p2, tmp_d);
375401 nlm_tmp[0 ] += nlm1[p1 + length] * nlm2[p2] * (*tmp_d);
376402 nlm_tmp[1 ] += nlm1[p1 + length * 2 ] * nlm2[p2] * (*tmp_d);
377403 nlm_tmp[2 ] += nlm1[p1 + length * 3 ] * nlm2[p2] * (*tmp_d);
@@ -390,6 +416,7 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
390416template <typename TK, typename TR>
391417void NonlocalNew<OperatorLCAO<TK, TR>>::cal_stress_IJR(const int & iat1,
392418 const int & iat2,
419+ const int & T0,
393420 const Parallel_Orbitals* paraV,
394421 const std::unordered_map<int , std::vector<double >>& nlm1_all,
395422 const std::unordered_map<int , std::vector<double >>& nlm2_all,
@@ -417,11 +444,11 @@ void NonlocalNew<OperatorLCAO<TK, TR>>::cal_stress_IJR(const int& iat1,
417444 assert (nlm1.size () == nlm2.size ());
418445#endif
419446 std::vector<double > nlm_tmp (6 , 0.0 );
420- for (int no = 0 ; no < this ->ucell ->atoms [this -> current_type ].ncpp .non_zero_count_soc [0 ]; no++)
447+ for (int no = 0 ; no < this ->ucell ->atoms [T0 ].ncpp .non_zero_count_soc [0 ]; no++)
421448 {
422- const int p1 = this ->ucell ->atoms [this -> current_type ].ncpp .index1_soc [0 ][no];
423- const int p2 = this ->ucell ->atoms [this -> current_type ].ncpp .index2_soc [0 ][no];
424- this ->ucell ->atoms [this -> current_type ].ncpp .get_d (0 , p1, p2, tmp_d);
449+ const int p1 = this ->ucell ->atoms [T0 ].ncpp .index1_soc [0 ][no];
450+ const int p2 = this ->ucell ->atoms [T0 ].ncpp .index2_soc [0 ][no];
451+ this ->ucell ->atoms [T0 ].ncpp .get_d (0 , p1, p2, tmp_d);
425452 nlm_tmp[0 ] += (nlm1[p1 + length] * dis1.x * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.x ) * (*tmp_d);
426453 nlm_tmp[1 ] += (nlm1[p1 + length] * dis1.y * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.y ) * (*tmp_d);
427454 nlm_tmp[2 ] += (nlm1[p1 + length] * dis1.z * nlm2[p2] + nlm1[p1] * nlm2[p2 + length] * dis2.z ) * (*tmp_d);
0 commit comments