optimize func_folding

dzzz2001 · dzzz2001 · commit 9c57cbe72758 · 2025-03-26T16:57:40.000+08:00
diff --git a/source/module_elecstate/module_dm/density_matrix.cpp b/source/module_elecstate/module_dm/density_matrix.cpp
@@ -166,13 +166,10 @@ void DensityMatrix<std::complex<double>, double>::cal_DMR(const int ik_in)
                     if(PARAM.inp.nspin != 4)
                     {
                         double* target_DMR_mat = target_DMR_mat_vec[ir];
-                        for(int irow = 0; irow < row_size; ++irow)
+                        for(int i = 0; i < mat_size; i++)
                         {
-                            for(int icol = 0; icol < col_size; ++icol)
-                            {
-                                target_DMR_mat[irow * col_size + icol] += kphase.real() * tmp_DMK_mat[irow * col_size + icol].real() 
-                                    - kphase.imag() * tmp_DMK_mat[irow * col_size + icol].imag();
-                            }
+                            target_DMR_mat[i] += kphase.real() * tmp_DMK_mat[i].real() 
+                                    - kphase.imag() * tmp_DMK_mat[i].imag();
                         }
                     } else if(PARAM.inp.nspin == 4)
                     {
diff --git a/source/module_hamilt_lcao/module_hcontainer/atom_pair.h b/source/module_hamilt_lcao/module_hcontainer/atom_pair.h
@@ -118,6 +118,12 @@ class AtomPair
     */
     void set_zero();
 
+    /**
+     * @brief get the begin row / begin col index of this AtomPair (returns the stored row_ap / col_ap)
+    */
+    int get_begin_row() const { return this->row_ap; } // folding_HR uses this as the row offset of this pair's block in hk
+    int get_begin_col() const { return this->col_ap; } // folding_HR uses this as the col offset of this pair's block in hk
+
     /**
      * @brief get col_size for this AtomPair
     */
diff --git a/source/module_hamilt_lcao/module_hcontainer/func_folding.cpp b/source/module_hamilt_lcao/module_hcontainer/func_folding.cpp
@@ -15,7 +15,7 @@ template<typename TR>
 void folding_HR(const hamilt::HContainer<TR>& hR,
                 std::complex<double>* hk,
                 const ModuleBase::Vector3<double>& kvec_d_in,
-                const int ncol,
+                const int hk_ld,
                 const int hk_type)
 {
 #ifdef _OPENMP
@@ -24,19 +24,54 @@ void folding_HR(const hamilt::HContainer<TR>& hR,
     for (int i = 0; i < hR.size_atom_pairs(); ++i)
     {
         hamilt::AtomPair<TR>& tmp = hR.get_atom_pair(i);
-        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        const int row_size = tmp.get_row_size();
+        const int col_size = tmp.get_col_size();
+        // accumulate this atom pair's block in a temporary buffer before adding it to hk
+        // hk_mat_tmp is row-major and stored contiguously in memory,
+        // so accumulating hr into hk_mat_tmp is faster than accumulating hr into hk directly
+        std::vector<std::complex<double>> hk_mat_tmp(row_size * col_size, 0);
+
+        // accumulate kphase * hr over all R of this atom pair into hk_mat_tmp
+        for(int ir = 0; ir < tmp.get_R_size(); ++ir)
         {
             const ModuleBase::Vector3<int> r_index = tmp.get_R_index(ir);
+            TR* hr_mat = tmp.get_pointer(ir);
             // cal k_phase
             // if TK==std::complex<double>, kphase is e^{ikR}
             const ModuleBase::Vector3<double> dR(r_index.x, r_index.y, r_index.z);
             const double arg = (kvec_d_in * dR) * ModuleBase::TWO_PI;
             double sinp, cosp;
             ModuleBase::libm::sincos(arg, &sinp, &cosp);
             std::complex<double> kphase = std::complex<double>(cosp, sinp);
+            
+            for(int i = 0; i < row_size * col_size; ++i)
+            {
+                hk_mat_tmp[i] += kphase * hr_mat[i];
+            }
+        }
 
-            tmp.find_R(r_index);
-            tmp.add_to_matrix(hk, ncol, kphase, hk_type);
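+        // add hk_mat_tmp to this pair's block of hk (hk_type == 0: row-major, hk_type == 1: column-major, leading dimension hk_ld)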
+        if (hk_type == 0)
+        {
+            std::complex<double>* hk_mat = hk + tmp.get_begin_row() * hk_ld + tmp.get_begin_col();
+            for(int irow = 0; irow < row_size; ++irow)
+            {
+                for(int icol = 0; icol < col_size; ++icol)
+                {
+                    hk_mat[irow * hk_ld + icol] += hk_mat_tmp[irow * col_size + icol];
+                }
+            }
+        }
+        else if(hk_type == 1)
+        {
+            std::complex<double>* hk_mat = hk + tmp.get_begin_col() * hk_ld + tmp.get_begin_row();
+            for(int icol = 0; icol < col_size; ++icol)
+            {
+                for(int irow = 0; irow < row_size; ++irow)
+                {
+                    hk_mat[icol * hk_ld + irow] += hk_mat_tmp[irow * col_size + icol];
+                }
+            }
         }
     }
     /*for (int i = 0; i < hR.size_R_loop(); ++i)
@@ -82,7 +117,7 @@ template void folding_HR<double>(const hamilt::HContainer<double>& hR,
 void folding_HR(const hamilt::HContainer<double>& hR,
                 double* hk,
                 const ModuleBase::Vector3<double>& kvec_d_in,
-                const int ncol,
+                const int hk_ld,
                 const int hk_type)
 {
 // in ABACUS, this function works with gamma-only case.
@@ -97,7 +132,7 @@ void folding_HR(const hamilt::HContainer<double>& hR,
         double kphase = 1.0;
 
         // Hk = HR 
-        hR.get_atom_pair(i).add_to_matrix(hk, ncol, kphase, hk_type);
+        hR.get_atom_pair(i).add_to_matrix(hk, hk_ld  , kphase, hk_type);
     }
 }
 
diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h b/source/module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h
@@ -16,13 +16,13 @@ template<typename TR>
 void folding_HR(const hamilt::HContainer<TR>& hR,
                 std::complex<double>* hk,
                 const ModuleBase::Vector3<double>& kvec_d_in,
-                const int ncol,
+                const int hk_ld,
                 const int hk_type);
 
 void folding_HR(const hamilt::HContainer<double>& hR,
                 double* hk,
                 const ModuleBase::Vector3<double>& kvec_d_in,
-                const int ncol,
+                const int hk_ld,
                 const int hk_type);
 
 #ifdef __MPI

Original file line number	Diff line number	Diff line change
`@@ -166,13 +166,10 @@ void DensityMatrix<std::complex<double>, double>::cal_DMR(const int ik_in)`
`166`	`166`	`if(PARAM.inp.nspin != 4)`
`167`	`167`	`{`
`168`	`168`	`double* target_DMR_mat = target_DMR_mat_vec[ir];`
`169`		`- for(int irow = 0; irow < row_size; ++irow)`
	`169`	`+ for(int i = 0; i < mat_size; i++)`
`170`	`170`	`{`
`171`		`- for(int icol = 0; icol < col_size; ++icol)`
`172`		`- {`
`173`		`- target_DMR_mat[irow * col_size + icol] += kphase.real() * tmp_DMK_mat[irow * col_size + icol].real()`
`174`		`- - kphase.imag() * tmp_DMK_mat[irow * col_size + icol].imag();`
`175`		`- }`
	`171`	`+ target_DMR_mat[i] += kphase.real() * tmp_DMK_mat[i].real()`
	`172`	`+ - kphase.imag() * tmp_DMK_mat[i].imag();`
`176`	`173`	`}`
`177`	`174`	`} else if(PARAM.inp.nspin == 4)`
`178`	`175`	`{`