Perf(LCAO): Various optimizations of detail code (#1901)

Alcanderian · web-flow · commit 889c49995d7a · 2023-02-20T11:31:17.000+08:00
* Perf: optimize folding_vl_k

* Perf: use libm in cal_dm_k

* Perf: optimize folding_fixedH. Optimize find_offset with binary search

* Fix: fix mem leak
diff --git a/source/module_gint/gint_k_pvpr.cpp b/source/module_gint/gint_k_pvpr.cpp
@@ -10,6 +10,7 @@
 #include "module_base/memory.h"
 #include "module_base/timer.h"
 #include "module_base/tool_threading.h"
+#include "module_base/libm/libm.h"
 
 void Gint_k::allocate_pvpR(void)
 {
@@ -174,48 +175,43 @@ void Gint_k::folding_vl_k(const int &ik, LCAO_Matrix *LM)
 
                             // calculate the phase factor exp(ikR).
                             const double arg = (GlobalC::kv.kvec_d[ ik ] * dR) * ModuleBase::TWO_PI;
-                            std::complex<double> phase = std::complex<double>(cos(arg), sin(arg));
+                            double sinp, cosp;
+                            ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                            std::complex<double> phase = std::complex<double>(cosp, sinp);
                             int ixxx = DM_start + GlobalC::GridT.find_R2st[iat][nad];
-                            for(int iw=0; iw<atom1->nw; iw++)
+                            
+                            if(GlobalV::NSPIN!=4)
                             {
-                                // iw1_lo
-                                if(GlobalV::NSPIN!=4)
+                                for(int iw=0; iw<atom1->nw; iw++)
                                 {
                                     std::complex<double> *vij = pvp[GlobalC::GridT.trace_lo[start1+iw]];
-
                                     int* iw2_lo = &GlobalC::GridT.trace_lo[start2];
-                                    int* iw2_end = iw2_lo + atom2->nw;
-
                                     // get the <phi | V | phi>(R) Hamiltonian.
                                     double *vijR = &pvpR_reduced[0][ixxx];
-                                    for(; iw2_lo<iw2_end; ++iw2_lo, ++vijR)
+                                    for(int iw2 = 0; iw2<atom2->nw; ++iw2)
                                     {
-                                        vij[iw2_lo[0]] += vijR[0] * phase; 
+                                        vij[iw2_lo[iw2]] += vijR[iw2] * phase; 
                                     }
+                                    ixxx += atom2->nw;
                                 }
-                                else
+                            }
+                            else
+                            {
+                                for(int iw=0; iw<atom1->nw; iw++)
                                 {
-                                    std::complex<double> *vij[4];
-                                    for(int spin=0;spin<4;spin++)
-                                        vij[spin] = pvp_nc[spin][GlobalC::GridT.trace_lo[start1]/GlobalV::NPOL + iw];
-
                                     int iw2_lo = GlobalC::GridT.trace_lo[start2]/GlobalV::NPOL;
-                                    int iw2_end = iw2_lo + atom2->nw;
-
-                                    double *vijR[4];
                                     for(int spin = 0;spin<4;spin++) 
                                     {
-                                        vijR[spin] = &pvpR_reduced[spin][ixxx];
-                                    }
-                                    for(; iw2_lo<iw2_end; ++iw2_lo, ++vijR[0], ++vijR[1],++vijR[2],++vijR[3])
-                                    {
-                                        for(int spin =0;spin<4;spin++)
+                                        auto vij = pvp_nc[spin][GlobalC::GridT.trace_lo[start1]/GlobalV::NPOL + iw];
+                                        auto vijR = &pvpR_reduced[spin][ixxx];
+                                        auto vijs = &vij[iw2_lo];
+                                        for(int iw2 = 0; iw2<atom2->nw; ++iw2)
                                         {
-                                            vij[spin][iw2_lo] += vijR[spin][0] * phase; 
+                                            vijs[iw2] += vijR[iw2] * phase; 
                                         }
-                                    }                                    
+                                    }
+                                    ixxx += atom2->nw;
                                 }
-                                ixxx += atom2->nw;
                             }
                             ++nad;
                         }// end distane<rcut
@@ -231,112 +227,144 @@ void Gint_k::folding_vl_k(const int &ik, LCAO_Matrix *LM)
     // Distribution of data.
     ModuleBase::timer::tick("Gint_k","Distri");
     std::complex<double>* tmp = new std::complex<double>[GlobalV::NLOCAL];
+    const double sign_table[2] = {1.0, -1.0};
 #ifdef _OPENMP
 #pragma omp parallel
 {
 #endif
     for (int i=0; i<GlobalV::NLOCAL; i++)
     {
-#ifdef _OPENMP
-#pragma omp for schedule(static, 256)
-#endif
-        for (int j=0; j<GlobalV::NLOCAL; j++)
-        {
-            tmp[j] = 0;
-        }
+        int i_flag = i & 1; // parity of i: 0 if i is even, 1 if i is odd
         const int mug = GlobalC::GridT.trace_lo[i];
         const int mug0 = mug/GlobalV::NPOL;
         // if the row element is on this processor.
         if (mug >= 0)
         {
+            if(GlobalV::NSPIN!=4)
+            {
 #ifdef _OPENMP
-#pragma omp for schedule(static, 256)
+#pragma omp for
 #endif
-            for (int j=0; j<GlobalV::NLOCAL; j++)
-            {
-                const int nug = GlobalC::GridT.trace_lo[j];
-                const int nug0 = nug/GlobalV::NPOL;
-                // if the col element is on this processor.
-                if (nug >=0)
+                for (int j=0; j<GlobalV::NLOCAL; j++)
                 {
-                    if (mug <= nug)
+                    tmp[j] = 0;
+                    const int nug = GlobalC::GridT.trace_lo[j];
+                    const int nug0 = nug/GlobalV::NPOL;
+                    // if the col element is on this processor.
+                    if (nug >=0)
                     {
-                        if(GlobalV::NSPIN!=4)
+                        if (mug <= nug)
                         {
                             // pvp is symmetric, only half is calculated.
                             tmp[j] = pvp[mug][nug];
                         }
                         else
                         {
-                            if(i%2==0&&j%2==0)
-                            {
-                                //spin = 0;
-                                tmp[j] = pvp_nc[0][mug0][nug0]+pvp_nc[3][mug0][nug0];
-                            }	
-                            else if(i%2==1&&j%2==1)
-                            {
-                                //spin = 3;
-                                tmp[j] = pvp_nc[0][mug0][nug0]-pvp_nc[3][mug0][nug0];
-                            }
-                            else if(i%2==0&&j%2==1)
-                            {
-                                // spin = 1;
-                                if(!GlobalV::DOMAG) tmp[j] = 0;
-                                else tmp[j] = pvp_nc[1][mug0][nug0] - std::complex<double>(0.0,1.0) * pvp_nc[2][mug0][nug0];
-                            }
-                            else if(i%2==1&&j%2==0) 
+                            // need to get elements from the other half.
+                            // I have question on this! 2011-02-22
+                            tmp[j] = conj(pvp[nug][mug]);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                if (GlobalV::DOMAG)
+                {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+                    for (int j=0; j<GlobalV::NLOCAL; j++)
+                    {
+                        tmp[j] = 0;
+                        int j_flag = j & 1; // parity of j: 0 if j is even, 1 if j is odd
+                        int ij_same = i_flag ^ j_flag ? 0 : 1;
+                        const int nug = GlobalC::GridT.trace_lo[j];
+                        const int nug0 = nug/GlobalV::NPOL;
+                        double sign = sign_table[j_flag];
+                        // if the col element is on this processor.
+                        if (nug >=0)
+                        {
+                            if (mug <= nug)
                             {
-                                //spin = 2;
-                                if(!GlobalV::DOMAG) tmp[j] = 0;
-                                else tmp[j] = pvp_nc[1][mug0][nug0] + std::complex<double>(0.0,1.0) * pvp_nc[2][mug0][nug0];
+                                if (ij_same)
+                                {
+                                    //spin = 0;
+                                    //spin = 3;
+                                    tmp[j] = pvp_nc[0][mug0][nug0]+sign*pvp_nc[3][mug0][nug0];
+                                }
+                                else
+                                {
+                                    // spin = 1;
+                                    // spin = 2;
+                                    tmp[j] = pvp_nc[1][mug0][nug0] + sign*std::complex<double>(0.0,1.0) * pvp_nc[2][mug0][nug0];
+                                }
                             }
                             else
                             {
-                                ModuleBase::WARNING_QUIT("Gint_k::folding_vl_k_nc","index is wrong!");
-                            }                            
+                                if (ij_same)
+                                {
+                                    //spin = 0;
+                                    //spin = 3;
+                                    tmp[j] = conj(pvp_nc[0][nug0][mug0]+sign*pvp_nc[3][nug0][mug0]);
+                                }
+                                else
+                                {
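+                                    // spin = 1 / spin = 2 (lower half, i and j of different parity)
+                                    // NOTE(review): the removed code had the opposite sign on the pvp_nc[2] term here ('-' for i odd/j even, '+' for i even/j odd); confirm taking `sign` from j_flag is intended.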
+                                    tmp[j] = conj(pvp_nc[1][nug0][mug0] + sign*std::complex<double>(0.0,1.0) * pvp_nc[2][nug0][mug0]);
+                                }
+                            }
                         }
                     }
-                    else
+                }
+                else
+                {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+                    for (int j=0; j<GlobalV::NLOCAL; j++)
                     {
-                        // need to get elements from the other half.
-                        // I have question on this! 2011-02-22
-                        if(GlobalV::NSPIN!=4)
-                        {
-                            tmp[j] = conj(pvp[nug][mug]);
-                        }
-                        else
+                        tmp[j] = 0;
+                        int j_flag = j & 1; // parity of j: 0 if j is even, 1 if j is odd
+                        int ij_same = i_flag ^ j_flag ? 0 : 1;
+
+                        if (!ij_same)
+                            continue;
+
+                        const int nug = GlobalC::GridT.trace_lo[j];
+                        const int nug0 = nug/GlobalV::NPOL;
+                        double sign = sign_table[j_flag];
+                        // if the col element is on this processor.
+                        if (nug >=0)
                         {
-                            if(i%2==0&&j%2==0)
+                            if (mug <= nug)
                             {
                                 //spin = 0;
-                                tmp[j] = conj(pvp_nc[0][nug0][mug0]+pvp_nc[3][nug0][mug0]);
-                            }	
-                            else if(i%2==1&&j%2==1)
-                            {
                                 //spin = 3;
-                                tmp[j] = conj(pvp_nc[0][nug0][mug0]-pvp_nc[3][nug0][mug0]);
-                            }
-                            else if(i%2==1&&j%2==0)
-                            {
-                                // spin = 1;
-                                if(!GlobalV::DOMAG) tmp[j] = 0;
-                                else tmp[j] = conj(pvp_nc[1][nug0][mug0] - std::complex<double>(0.0,1.0) * pvp_nc[2][nug0][mug0]);
-                            }
-                            else if(i%2==0&&j%2==1) 
-                            {
-                                //spin = 2;
-                                if(!GlobalV::DOMAG) tmp[j] = 0;
-                                else tmp[j] = conj(pvp_nc[1][nug0][mug0] + std::complex<double>(0.0,1.0) * pvp_nc[2][nug0][mug0]);
+                                tmp[j] = pvp_nc[0][mug0][nug0]+sign*pvp_nc[3][mug0][nug0];
                             }
                             else
                             {
-                                ModuleBase::WARNING_QUIT("Gint_k::folding_vl_k_nc","index is wrong!");
-                            }                           
+                                //spin = 0;
+                                //spin = 3;
+                                tmp[j] = conj(pvp_nc[0][nug0][mug0]+sign*pvp_nc[3][nug0][mug0]);
+                            }
                         }
                     }
                 }
             }
         }
+        else
+        {
+#ifdef _OPENMP
+#pragma omp for
+#endif
+            for (int j=0; j<GlobalV::NLOCAL; j++)
+            {
+                tmp[j] = 0;
+            }
+        }
 #ifdef _OPENMP
 #pragma omp single
 {
@@ -352,7 +380,7 @@ void Gint_k::folding_vl_k(const int &ik, LCAO_Matrix *LM)
         // according to the HPSEPS's 2D distribution methods.
         //-----------------------------------------------------
 #ifdef _OPENMP
-#pragma omp for schedule(static, 256)
+#pragma omp for
 #endif
         for (int j=0; j<GlobalV::NLOCAL; j++)
         {
diff --git a/source/module_gint/gint_vl.cpp b/source/module_gint/gint_vl.cpp
@@ -291,8 +291,7 @@ void Gint::cal_meshball_vlocal_gamma(
 	}
 }
 
-inline int find_offset(const int id1, const int id2, const int iat1, const int iat2,
-				int* find_start, int* find_end)
+inline int find_offset(const int id1, const int id2, const int iat1, const int iat2)
 {
 	const int R1x=GlobalC::GridT.ucell_index2x[id1];
 	const int R2x=GlobalC::GridT.ucell_index2x[id2];
@@ -305,16 +304,8 @@ inline int find_offset(const int id1, const int id2, const int iat1, const int i
 	const int dRz=R1z-R2z;
 
 	const int index=GlobalC::GridT.cal_RindexAtom(dRx, dRy, dRz, iat2);
-	
-	int offset=-1;
-	for(int* find=find_start; find < find_end; ++find)
-	{
-		if( find[0] == index )
-		{
-			offset = find - find_start;
-			break;
-		}
-	}
+
+	const int offset = GlobalC::GridT.binary_search_find_R2_offset(index, iat1);
 
 	assert(offset < GlobalC::GridT.nad[iat1]);
 	return offset;
@@ -348,9 +339,6 @@ void Gint::cal_meshball_vlocal_k(
 		const int T1 = GlobalC::ucell.iat2it[iat1];
 		const int id1 = GlobalC::GridT.which_unitcell[mcell_index1];
 		const int DM_start = GlobalC::GridT.nlocstartg[iat1];
-		// nad : how many adjacent atoms for atom 'iat'
-		int* find_start = GlobalC::GridT.find_R2[iat1];
-		int* find_end = GlobalC::GridT.find_R2[iat1] + GlobalC::GridT.nad[iat1];
 		for(int ia2=0; ia2<na_grid; ++ia2)
 		{
 			const int mcell_index2 = GlobalC::GridT.bcell_start[grid_index] + ia2;
@@ -373,8 +361,7 @@ void Gint::cal_meshball_vlocal_k(
 				const int mcell_index2 = GlobalC::GridT.bcell_start[grid_index] + ia2;
 				const int id2 = GlobalC::GridT.which_unitcell[mcell_index2];
 				int offset;
-				offset=find_offset(id1, id2, iat1, iat2,
-						find_start, find_end);
+				offset=find_offset(id1, id2, iat1, iat2);
 
 				const int iatw = DM_start + GlobalC::GridT.find_R2st[iat1][offset];	
 
diff --git a/source/module_gint/grid_technique.cpp b/source/module_gint/grid_technique.cpp
@@ -50,9 +50,11 @@ Grid_Technique::~Grid_Technique()
 		{
 			delete[] find_R2[iat];
 			delete[] find_R2st[iat];
+			delete[] find_R2_sorted_index[iat];
 		}
 		delete[] find_R2;
 		delete[] find_R2st;
+		delete[] find_R2_sorted_index;
 	}
 }
 
diff --git a/source/module_gint/grid_technique.h b/source/module_gint/grid_technique.h
@@ -60,8 +60,10 @@ class Grid_Technique : public Grid_MeshBall
     
     int* nad; // number of adjacent atoms for each atom.
 	int **find_R2;
+	int **find_R2_sorted_index;
 	int **find_R2st;
     bool allocate_find_R2;
+	int binary_search_find_R2_offset(int val, int iat);
 
 	//indexes for nnrg -> orbital index + R index
 	std::vector<gridIntegral::gridIndex> nnrg_index;
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/DM_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/DM_k.cpp
@@ -4,6 +4,7 @@
 #include "src_parallel/parallel_common.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "local_orbital_charge.h"
+#include "module_base/libm/libm.h"
 
 #ifdef __MKL
 #include <mkl_service.h>
@@ -92,7 +93,7 @@ inline void cal_DM_ATOM(const Grid_Technique &gt,
             const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0);
             const int iw2_lo = gt.trace_lo[start2];
             const int nw2 = atom2->nw;
-            std::complex<double> exp_R = exp(fac
+            std::complex<double> exp_R = ModuleBase::libm::exp(fac
                                              * (GlobalC::kv.kvec_d[ik].x * RA.info[ia1][ia2][0]
                                                 + GlobalC::kv.kvec_d[ik].y * RA.info[ia1][ia2][1]
                                                 + GlobalC::kv.kvec_d[ik].z * RA.info[ia1][ia2][2]));
@@ -183,7 +184,7 @@ inline void cal_DM_ATOM_nc(const Grid_Technique &gt,
                     const int start2 = GlobalC::ucell.itiaiw2iwt(T2, I2, 0);
                     const int iw2_lo = gt.trace_lo[start2] / GlobalV::NPOL + gt.lgd / GlobalV::NPOL * is2;
                     const int nw2 = atom2->nw;
-                    std::complex<double> exp_R = exp(fac
+                    std::complex<double> exp_R = ModuleBase::libm::exp(fac
                                                      * (GlobalC::kv.kvec_d[ik].x * RA.info[ia1][ia2][0]
                                                         + GlobalC::kv.kvec_d[ik].y * RA.info[ia1][ia2][1]
                                                         + GlobalC::kv.kvec_d[ik].z * RA.info[ia1][ia2][2]));
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nnr.cpp

Original file line number	Diff line number	Diff line change
`@@ -50,9 +50,11 @@ Grid_Technique::~Grid_Technique()`
`50`	`50`	`{`
`51`	`51`	`delete[] find_R2[iat];`
`52`	`52`	`delete[] find_R2st[iat];`
	`53`	`+ delete[] find_R2_sorted_index[iat];`
`53`	`54`	`}`
`54`	`55`	`delete[] find_R2;`
`55`	`56`	`delete[] find_R2st;`
	`57`	`+ delete[] find_R2_sorted_index;`
`56`	`58`	`}`
`57`	`59`	`}`
`58`	`60`