|
8 | 8 | #include <map> |
9 | 9 | #include "../module_base/timer.h" |
10 | 10 |
|
| 11 | +#ifdef __MKL |
| 12 | +#include <mkl_service.h> |
| 13 | +#endif |
| 14 | + |
| 15 | +#ifdef _OPENMP |
| 16 | +#include <omp.h> |
| 17 | +#endif |
| 18 | + |
11 | 19 | LCAO_gen_fixedH::LCAO_gen_fixedH() |
12 | 20 | {} |
13 | 21 |
|
@@ -1135,6 +1143,214 @@ void LCAO_gen_fixedH::build_Nonlocal_beta_new() //update by liuyu 2021-04-07 |
1135 | 1143 | ModuleBase::TITLE("LCAO_gen_fixedH","build_Nonlocal_beta_new"); |
1136 | 1144 | ModuleBase::timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta_new"); |
1137 | 1145 |
|
| 1146 | +#ifdef __MKL |
| 1147 | + const int mkl_threads = mkl_get_max_threads(); |
| 1148 | + mkl_set_num_threads(1); |
| 1149 | +#endif |
| 1150 | + |
| 1151 | + const std::vector<std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>>> adjs_all = GlobalC::GridD.get_adjs(); |
| 1152 | + |
| 1153 | +#ifdef _OPENMP |
| 1154 | + #pragma omp parallel |
| 1155 | + { |
| 1156 | + double* Nonlocal_thread; |
| 1157 | + Nonlocal_thread = new double[GlobalC::ParaO.nloc]; |
| 1158 | + ModuleBase::GlobalFunc::ZEROS(Nonlocal_thread, GlobalC::ParaO.nloc); |
| 1159 | + #pragma omp for schedule(dynamic) |
| 1160 | +#endif |
| 1161 | + for(int iat=0; iat<GlobalC::ucell.nat; iat++) |
| 1162 | + { |
| 1163 | + const int T0 = GlobalC::ucell.iat2it[iat]; |
| 1164 | + const int I0 = GlobalC::ucell.iat2ia[iat]; |
| 1165 | + Atom* atom0 = &GlobalC::ucell.atoms[T0]; |
| 1166 | + |
| 1167 | + //======================================================= |
| 1168 | + //Step1: |
| 1169 | + //saves <beta|psi>, where beta runs over L0,M0 on atom I0 |
| 1170 | + //and psi runs over atomic basis sets on the current core |
| 1171 | + //======================================================= |
| 1172 | + #ifdef _OPENMP |
| 1173 | + std::vector<std::unordered_map<int,std::vector<double>>> nlm_tot_thread; |
| 1174 | + nlm_tot_thread.resize(adjs_all[iat].size()); |
| 1175 | + #else |
| 1176 | + std::vector<std::unordered_map<int,std::vector<double>>> nlm_tot; |
| 1177 | + nlm_tot.resize(GlobalC::GridD.getAdjacentNum()+1); |
| 1178 | + #endif |
| 1179 | + |
| 1180 | + const ModuleBase::Vector3<double> tau0 = atom0->tau[I0]; |
| 1181 | + const double Rcut_Beta = GlobalC::ucell.infoNL.Beta[T0].get_rcut_max(); |
| 1182 | + |
| 1183 | + //outermost loop : all adjacent atoms |
| 1184 | + for(int ad_count=0; ad_count < adjs_all[iat].size(); ad_count++) |
| 1185 | + { |
| 1186 | + const int T1 = std::get<0>(adjs_all[iat][ad_count]); |
| 1187 | + const int I1 = std::get<1>(adjs_all[iat][ad_count]); |
| 1188 | + const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0); |
| 1189 | + const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut(); |
| 1190 | + const ModuleBase::Vector3<double> tau1 = std::get<3>(adjs_all[iat][ad_count]); |
| 1191 | + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; |
| 1192 | + const int nw1_tot = atom1->nw*GlobalV::NPOL; |
| 1193 | + |
| 1194 | + #ifdef _OPENMP |
| 1195 | + nlm_tot_thread[ad_count].clear(); |
| 1196 | + #else |
| 1197 | + nlm_tot[ad_count].clear(); |
| 1198 | + #endif |
| 1199 | + |
| 1200 | + //middle loop : atomic basis on current processor (either row or column) |
| 1201 | + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; |
| 1202 | + if (dist1 > Rcut_Beta + Rcut_AO1) |
| 1203 | + { |
| 1204 | + continue; |
| 1205 | + } |
| 1206 | + |
| 1207 | + for(int iw1=0; iw1<nw1_tot; ++iw1) |
| 1208 | + { |
| 1209 | + const int iw1_all = start1 + iw1; |
| 1210 | + const int iw1_local = GlobalC::ParaO.trace_loc_row[iw1_all]; |
| 1211 | + const int iw2_local = GlobalC::ParaO.trace_loc_col[iw1_all]; |
| 1212 | + |
| 1213 | + if(iw1_local < 0 && iw2_local < 0) continue; |
| 1214 | + |
| 1215 | + const int iw1_0 = iw1/GlobalV::NPOL; |
| 1216 | + std::vector<std::vector<double>> nlm; |
| 1217 | + //2D, but first dimension is only 1 here |
| 1218 | + //for force, the right hand side is the gradient |
| 1219 | + //and the first dimension is then 3 |
| 1220 | + //inner loop : all projectors (L0,M0) |
| 1221 | + GlobalC::UOT.snap_psibeta_half( |
| 1222 | + GlobalC::ORB, |
| 1223 | + GlobalC::ucell.infoNL, |
| 1224 | + nlm, tau1, T1, |
| 1225 | + atom1->iw2l[ iw1_0 ], // L1 |
| 1226 | + atom1->iw2m[ iw1_0 ], // m1 |
| 1227 | + atom1->iw2n[ iw1_0 ], // N1 |
| 1228 | + GlobalC::ucell.atoms[T0].tau[I0], T0, 0); //R0,T0 |
| 1229 | + |
| 1230 | + #ifdef _OPENMP |
| 1231 | + nlm_tot_thread[ad_count].insert({iw1_all,nlm[0]}); |
| 1232 | + #else |
| 1233 | + nlm_tot[ad_count].insert({iw1_all,nlm[0]}); |
| 1234 | + #endif |
| 1235 | + }//end iw |
| 1236 | + }//end ad |
| 1237 | + |
| 1238 | + //======================================================= |
| 1239 | + //Step2: |
| 1240 | + //calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j> |
| 1241 | + //and accumulate the value to Hloc_fixed(i,j) |
| 1242 | + //======================================================= |
| 1243 | + for(int ad1_count=0; ad1_count < adjs_all[iat].size(); ad1_count++) |
| 1244 | + { |
| 1245 | + const int T1 = std::get<0>(adjs_all[iat][ad1_count]); |
| 1246 | + const int I1 = std::get<1>(adjs_all[iat][ad1_count]); |
| 1247 | + const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0); |
| 1248 | + const ModuleBase::Vector3<double> tau1 = std::get<3>(adjs_all[iat][ad1_count]); |
| 1249 | + const Atom* atom1 = &GlobalC::ucell.atoms[T1]; |
| 1250 | + const int nw1_tot = atom1->nw*GlobalV::NPOL; |
| 1251 | + const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut(); |
| 1252 | + |
| 1253 | + for (int ad2_count=0; ad2_count < adjs_all[iat].size(); ad2_count++) |
| 1254 | + { |
| 1255 | + const int T2 = std::get<0>(adjs_all[iat][ad2_count]); |
| 1256 | + const int I2 = std::get<1>(adjs_all[iat][ad2_count]); |
| 1257 | + const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0); |
| 1258 | + const ModuleBase::Vector3<double> tau2 = std::get<3>(adjs_all[iat][ad2_count]); |
| 1259 | + const Atom* atom2 = &GlobalC::ucell.atoms[T2]; |
| 1260 | + const int nw2_tot = atom2->nw*GlobalV::NPOL; |
| 1261 | + const double Rcut_AO2 = GlobalC::ORB.Phi[T2].getRcut(); |
| 1262 | + const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0; |
| 1263 | + const double dist2 = (tau2-tau0).norm() * GlobalC::ucell.lat0; |
| 1264 | + |
| 1265 | + if (dist1 > Rcut_Beta + Rcut_AO1 |
| 1266 | + || dist2 > Rcut_Beta + Rcut_AO2) |
| 1267 | + { |
| 1268 | + continue; |
| 1269 | + } |
| 1270 | + |
| 1271 | + for(int iw1=0; iw1<nw1_tot; ++iw1) |
| 1272 | + { |
| 1273 | + const int iw1_all = start1 + iw1; |
| 1274 | + const int iw1_local = GlobalC::ParaO.trace_loc_row[iw1_all]; |
| 1275 | + if(iw1_local < 0) continue; |
| 1276 | + const int iw1_0 = iw1/GlobalV::NPOL; |
| 1277 | + for(int iw2=0; iw2<nw2_tot; ++iw2) |
| 1278 | + { |
| 1279 | + const int iw2_all = start2 + iw2; |
| 1280 | + const int iw2_local = GlobalC::ParaO.trace_loc_col[iw2_all]; |
| 1281 | + if(iw2_local < 0) continue; |
| 1282 | + const int iw2_0 = iw2/GlobalV::NPOL; |
| 1283 | + #ifdef _OPENMP |
| 1284 | + std::vector<double> nlm1 = nlm_tot_thread[ad1_count][iw1_all]; |
| 1285 | + std::vector<double> nlm2 = nlm_tot_thread[ad2_count][iw2_all]; |
| 1286 | + #else |
| 1287 | + std::vector<double> nlm1 = nlm_tot[ad1_count][iw1_all]; |
| 1288 | + std::vector<double> nlm2 = nlm_tot[ad2_count][iw2_all]; |
| 1289 | + #endif |
| 1290 | + |
| 1291 | + assert(nlm1.size()==nlm2.size()); |
| 1292 | + #ifdef _OPENMP |
| 1293 | + double nlm_thread=0.0; |
| 1294 | + #else |
| 1295 | + double nlm=0.0; |
| 1296 | + #endif |
| 1297 | + const int nproj = GlobalC::ucell.infoNL.nproj[T0]; |
| 1298 | + int ib = 0; |
| 1299 | + for(int nb = 0; nb < nproj; nb++) |
| 1300 | + { |
| 1301 | + const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL(); |
| 1302 | + for(int m=0;m<2*L0+1;m++) |
| 1303 | + { |
| 1304 | + #ifdef _OPENMP |
| 1305 | + nlm_thread += nlm1[ib]*nlm2[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); |
| 1306 | + #else |
| 1307 | + nlm += nlm1[ib]*nlm2[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb); |
| 1308 | + #endif |
| 1309 | + ib+=1; |
| 1310 | + } |
| 1311 | + } |
| 1312 | + assert(ib==nlm1.size()); |
| 1313 | + |
| 1314 | + const int ir = GlobalC::ParaO.trace_loc_row[ iw1_all ]; |
| 1315 | + const int ic = GlobalC::ParaO.trace_loc_col[ iw2_all ]; |
| 1316 | + long index=0; |
| 1317 | + if(GlobalV::KS_SOLVER=="genelpa" || GlobalV::KS_SOLVER=="scalapack_gvx") |
| 1318 | + { |
| 1319 | + index=ic*GlobalC::ParaO.nrow+ir; |
| 1320 | + } |
| 1321 | + else |
| 1322 | + { |
| 1323 | + index=ir*GlobalC::ParaO.ncol+ic; |
| 1324 | + } |
| 1325 | + #ifdef _OPENMP |
| 1326 | + Nonlocal_thread[index] += nlm_thread; |
| 1327 | + #else |
| 1328 | + GlobalC::LM.set_HSgamma(iw1_all,iw2_all,nlm,'N'); |
| 1329 | + #endif |
| 1330 | + }//iw2 |
| 1331 | + }//iw1 |
| 1332 | + }//ad2 |
| 1333 | + }//ad1 |
| 1334 | + }//end iat |
| 1335 | + |
| 1336 | + #ifdef _OPENMP |
| 1337 | + #pragma omp critical(cal_nonlocal) |
| 1338 | + { |
| 1339 | + for(int i=0; i<GlobalC::ParaO.nloc; i++) |
| 1340 | + { |
| 1341 | + GlobalC::LM.Hloc_fixed[i] += Nonlocal_thread[i]; |
| 1342 | + } |
| 1343 | + } |
| 1344 | + delete[] Nonlocal_thread; |
| 1345 | + #endif |
| 1346 | +#ifdef _OPENMP |
| 1347 | + } |
| 1348 | +#endif |
| 1349 | + |
| 1350 | +#ifdef __MKL |
| 1351 | + mkl_set_num_threads(mkl_threads); |
| 1352 | +#endif |
| 1353 | +/* |
1138 | 1354 | for (int T0 = 0; T0 < GlobalC::ucell.ntype; T0++) |
1139 | 1355 | { |
1140 | 1356 | Atom* atom0 = &GlobalC::ucell.atoms[T0]; |
@@ -1274,6 +1490,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta_new() //update by liuyu 2021-04-07 |
1274 | 1490 |
|
1275 | 1491 | }//end I0 |
1276 | 1492 | }//end T0 |
| 1493 | +*/ |
1277 | 1494 |
|
1278 | 1495 | ModuleBase::timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta_new"); |
1279 | 1496 | return; |
|
0 commit comments