Skip to content

Commit 7d26f9c

Browse files
committed
OpenMP optimization for nonlocal pseudo-potential under gamma-only line
1 parent 4fcb6ed commit 7d26f9c

File tree

3 files changed

+252
-0
lines changed

3 files changed

+252
-0
lines changed

source/module_neighbor/sltk_grid_driver.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "sltk_grid_driver.h"
22
#include "../src_pw/tools.h"
3+
#include "../src_pw/global.h"
34

45
Grid_Driver::Grid_Driver(
56
const int &test_d_in,
@@ -312,3 +313,33 @@ ModuleBase::Vector3<double> Grid_Driver::Calculate_adjacent_site
312313

313314
return adjacent_site;
314315
}
316+
317+
std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>> Grid_Driver::get_adjs(const size_t &iat)
318+
{
319+
const int it = GlobalC::ucell.iat2it[iat];
320+
const int ia = GlobalC::ucell.iat2ia[iat];
321+
const ModuleBase::Vector3<double> &tau = GlobalC::ucell.atoms[it].tau[ia];
322+
323+
std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>> adjs;
324+
GlobalC::GridD.Find_atom(GlobalC::ucell, tau, it, ia);
325+
for(int ad=0; ad<GlobalC::GridD.getAdjacentNum()+1; ad++)
326+
{
327+
const size_t it_ad = GlobalC::GridD.getType(ad);
328+
const size_t ia_ad = GlobalC::GridD.getNatom(ad);
329+
const ModuleBase::Vector3<int> box_ad = GlobalC::GridD.getBox(ad);
330+
const ModuleBase::Vector3<double> tau_ad = GlobalC::GridD.getAdjacentTau(ad);
331+
332+
adjs.push_back(std::make_tuple(it_ad, ia_ad, box_ad, tau_ad));
333+
}
334+
return adjs;
335+
}
336+
337+
std::vector<std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>>> Grid_Driver::get_adjs()
338+
{
339+
std::vector<std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>>> adjs(GlobalC::ucell.nat);
340+
for(size_t iat=0; iat<GlobalC::ucell.nat; iat++)
341+
{
342+
adjs[iat] = Grid_Driver::get_adjs(iat);
343+
}
344+
return adjs;
345+
}

source/module_neighbor/sltk_grid_driver.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "sltk_grid.h"
99
#include "../src_pw/tools.h"
1010
#include "../src_pw/pw_basis.h"
11+
#include <tuple>
1112

1213
class Grid_Driver : public Grid
1314
{
@@ -48,6 +49,9 @@ class Grid_Driver : public Grid
4849
const ModuleBase::Vector3<double>& getAdjacentTau(const int i) const { return adjacent_tau[i]; }
4950
const ModuleBase::Vector3<int>& getBox(const int i) const {return box[i];}
5051

52+
std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>> get_adjs(const size_t &iat);
53+
std::vector<std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>>> get_adjs();
54+
5155
private:
5256

5357
mutable int adj_num;

source/src_lcao/LCAO_gen_fixedH.cpp

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@
77
#include <unordered_map>
88
#include <map>
99

10+
#ifdef __MKL
11+
#include <mkl_service.h>
12+
#endif
13+
14+
#ifdef _OPENMP
15+
#include <omp.h>
16+
#endif
17+
1018
LCAO_gen_fixedH::LCAO_gen_fixedH()
1119
{}
1220

@@ -1134,6 +1142,214 @@ void LCAO_gen_fixedH::build_Nonlocal_beta_new() //update by liuyu 2021-04-07
11341142
ModuleBase::TITLE("LCAO_gen_fixedH","build_Nonlocal_beta_new");
11351143
ModuleBase::timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta_new");
11361144

1145+
#ifdef __MKL
1146+
const int mkl_threads = mkl_get_max_threads();
1147+
mkl_set_num_threads(1);
1148+
#endif
1149+
1150+
const std::vector<std::vector<std::tuple<int, int, ModuleBase::Vector3<int>, ModuleBase::Vector3<double>>>> adjs_all = GlobalC::GridD.get_adjs();
1151+
1152+
#ifdef _OPENMP
1153+
#pragma omp parallel
1154+
{
1155+
double* Nonlocal_thread;
1156+
Nonlocal_thread = new double[GlobalC::ParaO.nloc];
1157+
ModuleBase::GlobalFunc::ZEROS(Nonlocal_thread, GlobalC::ParaO.nloc);
1158+
#pragma omp for schedule(dynamic)
1159+
#endif
1160+
for(int iat=0; iat<GlobalC::ucell.nat; iat++)
1161+
{
1162+
const int T0 = GlobalC::ucell.iat2it[iat];
1163+
const int I0 = GlobalC::ucell.iat2ia[iat];
1164+
Atom* atom0 = &GlobalC::ucell.atoms[T0];
1165+
1166+
//=======================================================
1167+
//Step1:
1168+
//saves <beta|psi>, where beta runs over L0,M0 on atom I0
1169+
//and psi runs over atomic basis sets on the current core
1170+
//=======================================================
1171+
#ifdef _OPENMP
1172+
std::vector<std::unordered_map<int,std::vector<double>>> nlm_tot_thread;
1173+
nlm_tot_thread.resize(adjs_all[iat].size());
1174+
#else
1175+
std::vector<std::unordered_map<int,std::vector<double>>> nlm_tot;
1176+
nlm_tot.resize(GlobalC::GridD.getAdjacentNum()+1);
1177+
#endif
1178+
1179+
const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
1180+
const double Rcut_Beta = GlobalC::ucell.infoNL.Beta[T0].get_rcut_max();
1181+
1182+
//outermost loop : all adjacent atoms
1183+
for(int ad_count=0; ad_count < adjs_all[iat].size(); ad_count++)
1184+
{
1185+
const int T1 = std::get<0>(adjs_all[iat][ad_count]);
1186+
const int I1 = std::get<1>(adjs_all[iat][ad_count]);
1187+
const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0);
1188+
const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut();
1189+
const ModuleBase::Vector3<double> tau1 = std::get<3>(adjs_all[iat][ad_count]);
1190+
const Atom* atom1 = &GlobalC::ucell.atoms[T1];
1191+
const int nw1_tot = atom1->nw*GlobalV::NPOL;
1192+
1193+
#ifdef _OPENMP
1194+
nlm_tot_thread[ad_count].clear();
1195+
#else
1196+
nlm_tot[ad_count].clear();
1197+
#endif
1198+
1199+
//middle loop : atomic basis on current processor (either row or column)
1200+
const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0;
1201+
if (dist1 > Rcut_Beta + Rcut_AO1)
1202+
{
1203+
continue;
1204+
}
1205+
1206+
for(int iw1=0; iw1<nw1_tot; ++iw1)
1207+
{
1208+
const int iw1_all = start1 + iw1;
1209+
const int iw1_local = GlobalC::ParaO.trace_loc_row[iw1_all];
1210+
const int iw2_local = GlobalC::ParaO.trace_loc_col[iw1_all];
1211+
1212+
if(iw1_local < 0 && iw2_local < 0) continue;
1213+
1214+
const int iw1_0 = iw1/GlobalV::NPOL;
1215+
std::vector<std::vector<double>> nlm;
1216+
//2D, but first dimension is only 1 here
1217+
//for force, the right hand side is the gradient
1218+
//and the first dimension is then 3
1219+
//inner loop : all projectors (L0,M0)
1220+
GlobalC::UOT.snap_psibeta_half(
1221+
GlobalC::ORB,
1222+
GlobalC::ucell.infoNL,
1223+
nlm, tau1, T1,
1224+
atom1->iw2l[ iw1_0 ], // L1
1225+
atom1->iw2m[ iw1_0 ], // m1
1226+
atom1->iw2n[ iw1_0 ], // N1
1227+
GlobalC::ucell.atoms[T0].tau[I0], T0, 0); //R0,T0
1228+
1229+
#ifdef _OPENMP
1230+
nlm_tot_thread[ad_count].insert({iw1_all,nlm[0]});
1231+
#else
1232+
nlm_tot[ad_count].insert({iw1_all,nlm[0]});
1233+
#endif
1234+
}//end iw
1235+
}//end ad
1236+
1237+
//=======================================================
1238+
//Step2:
1239+
//calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j>
1240+
//and accumulate the value to Hloc_fixed(i,j)
1241+
//=======================================================
1242+
for(int ad1_count=0; ad1_count < adjs_all[iat].size(); ad1_count++)
1243+
{
1244+
const int T1 = std::get<0>(adjs_all[iat][ad1_count]);
1245+
const int I1 = std::get<1>(adjs_all[iat][ad1_count]);
1246+
const int start1 = GlobalC::ucell.itiaiw2iwt(T1, I1, 0);
1247+
const ModuleBase::Vector3<double> tau1 = std::get<3>(adjs_all[iat][ad1_count]);
1248+
const Atom* atom1 = &GlobalC::ucell.atoms[T1];
1249+
const int nw1_tot = atom1->nw*GlobalV::NPOL;
1250+
const double Rcut_AO1 = GlobalC::ORB.Phi[T1].getRcut();
1251+
1252+
for (int ad2_count=0; ad2_count < adjs_all[iat].size(); ad2_count++)
1253+
{
1254+
const int T2 = std::get<0>(adjs_all[iat][ad2_count]);
1255+
const int I2 = std::get<1>(adjs_all[iat][ad2_count]);
1256+
const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0);
1257+
const ModuleBase::Vector3<double> tau2 = std::get<3>(adjs_all[iat][ad2_count]);
1258+
const Atom* atom2 = &GlobalC::ucell.atoms[T2];
1259+
const int nw2_tot = atom2->nw*GlobalV::NPOL;
1260+
const double Rcut_AO2 = GlobalC::ORB.Phi[T2].getRcut();
1261+
const double dist1 = (tau1-tau0).norm() * GlobalC::ucell.lat0;
1262+
const double dist2 = (tau2-tau0).norm() * GlobalC::ucell.lat0;
1263+
1264+
if (dist1 > Rcut_Beta + Rcut_AO1
1265+
|| dist2 > Rcut_Beta + Rcut_AO2)
1266+
{
1267+
continue;
1268+
}
1269+
1270+
for(int iw1=0; iw1<nw1_tot; ++iw1)
1271+
{
1272+
const int iw1_all = start1 + iw1;
1273+
const int iw1_local = GlobalC::ParaO.trace_loc_row[iw1_all];
1274+
if(iw1_local < 0) continue;
1275+
const int iw1_0 = iw1/GlobalV::NPOL;
1276+
for(int iw2=0; iw2<nw2_tot; ++iw2)
1277+
{
1278+
const int iw2_all = start2 + iw2;
1279+
const int iw2_local = GlobalC::ParaO.trace_loc_col[iw2_all];
1280+
if(iw2_local < 0) continue;
1281+
const int iw2_0 = iw2/GlobalV::NPOL;
1282+
#ifdef _OPENMP
1283+
std::vector<double> nlm1 = nlm_tot_thread[ad1_count][iw1_all];
1284+
std::vector<double> nlm2 = nlm_tot_thread[ad2_count][iw2_all];
1285+
#else
1286+
std::vector<double> nlm1 = nlm_tot[ad1_count][iw1_all];
1287+
std::vector<double> nlm2 = nlm_tot[ad2_count][iw2_all];
1288+
#endif
1289+
1290+
assert(nlm1.size()==nlm2.size());
1291+
#ifdef _OPENMP
1292+
double nlm_thread=0.0;
1293+
#else
1294+
double nlm=0.0;
1295+
#endif
1296+
const int nproj = GlobalC::ucell.infoNL.nproj[T0];
1297+
int ib = 0;
1298+
for(int nb = 0; nb < nproj; nb++)
1299+
{
1300+
const int L0 = GlobalC::ucell.infoNL.Beta[T0].Proj[nb].getL();
1301+
for(int m=0;m<2*L0+1;m++)
1302+
{
1303+
#ifdef _OPENMP
1304+
nlm_thread += nlm1[ib]*nlm2[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb);
1305+
#else
1306+
nlm += nlm1[ib]*nlm2[ib]*GlobalC::ucell.atoms[T0].dion(nb,nb);
1307+
#endif
1308+
ib+=1;
1309+
}
1310+
}
1311+
assert(ib==nlm1.size());
1312+
1313+
const int ir = GlobalC::ParaO.trace_loc_row[ iw1_all ];
1314+
const int ic = GlobalC::ParaO.trace_loc_col[ iw2_all ];
1315+
long index=0;
1316+
if(GlobalV::KS_SOLVER=="genelpa" || GlobalV::KS_SOLVER=="scalapack_gvx")
1317+
{
1318+
index=ic*GlobalC::ParaO.nrow+ir;
1319+
}
1320+
else
1321+
{
1322+
index=ir*GlobalC::ParaO.ncol+ic;
1323+
}
1324+
#ifdef _OPENMP
1325+
Nonlocal_thread[index] += nlm_thread;
1326+
#else
1327+
GlobalC::LM.set_HSgamma(iw1_all,iw2_all,nlm,'N');
1328+
#endif
1329+
}//iw2
1330+
}//iw1
1331+
}//ad2
1332+
}//ad1
1333+
}//end iat
1334+
1335+
#ifdef _OPENMP
1336+
#pragma omp critical(cal_nonlocal)
1337+
{
1338+
for(int i=0; i<GlobalC::ParaO.nloc; i++)
1339+
{
1340+
GlobalC::LM.Hloc_fixed[i] += Nonlocal_thread[i];
1341+
}
1342+
}
1343+
delete[] Nonlocal_thread;
1344+
#endif
1345+
#ifdef _OPENMP
1346+
}
1347+
#endif
1348+
1349+
#ifdef __MKL
1350+
mkl_set_num_threads(mkl_threads);
1351+
#endif
1352+
/*
11371353
for (int T0 = 0; T0 < GlobalC::ucell.ntype; T0++)
11381354
{
11391355
Atom* atom0 = &GlobalC::ucell.atoms[T0];
@@ -1273,6 +1489,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta_new() //update by liuyu 2021-04-07
12731489
12741490
}//end I0
12751491
}//end T0
1492+
*/
12761493

12771494
ModuleBase::timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta_new");
12781495
return;

0 commit comments

Comments
 (0)