 #include "module_lr/utils/lr_util.h"
 #include "module_lr/utils/lr_util_xc.hpp"
 #include <set>
+#include <chrono>
 #include "module_io/cube_io.h"
 #ifdef USE_LIBXC
 #include <xc.h>
 #include "module_hamilt_general/module_xc/xc_functional_libxc.h"
 #endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 LR::KernelXC::KernelXC(const ModulePW::PW_Basis& rho_basis,
     const UnitCell& ucell,
@@ -66,6 +70,29 @@ LR::KernelXC::KernelXC(const ModulePW::PW_Basis& rho_basis,
 #endif
 }
 
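+// Element-wise accumulation helpers: add_op writes the element-wise sum of src1 and src2
+// into dst, and add_assign_op(src, dst) accumulates src into dst (dst[i] += src[i]).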
+template <typename T>
+inline void add_op(const T* const src1, const T* const src2, T* const dst, const int size)
+{
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096)
+#endif
+    for (int i = 0; i < size; ++i)
+    {
+        dst[i] = src1[i] + src2[i];
+    }
+}
+template <typename T>
+inline void add_op(const std::vector<T>& src1, const std::vector<T>& src2, std::vector<T>& dst)
+{
+    assert(dst.size() >= src1.size() && src2.size() >= src1.size());
+    add_op(src1.data(), src2.data(), dst.data(), src1.size());
+}
+template <typename T>
+inline void add_assign_op(const std::vector<T>& src, std::vector<T>& dst)
+{
+    add_op(src, dst, dst);
+}
+
 #ifdef USE_LIBXC
 void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const double& tpiba, const double* const* const rho_gs, const double* const rho_core)
 {
@@ -115,7 +142,8 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 1024)
 #endif
-        for (int ir = 0; ir < nrxx; ++ir) rhor[ir] = rho[ir * nspin + is];
+        for (int ir = 0; ir < nrxx; ++ir) { rhor[ir] = rho[ir * nspin + is];
+        }
         gradrho[is].resize(nrxx);
         LR_Util::grad(rhor.data(), gradrho[is].data(), rho_basis_, tpiba);
     }
@@ -126,8 +154,9 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 1024)
 #endif
-        for (int ir = 0; ir < nrxx; ++ir)
+        for (int ir = 0; ir < nrxx; ++ir) {
             sigma[ir] = gradrho[0][ir] * gradrho[0][ir];
+        }
     }
     else
     {
@@ -144,13 +173,13 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
     }
     // -----------------------------------------------------------------------------------
     // ==================== XC Kernels (f_xc)=============================
-    this->vrho_.resize(nspin * nrxx);
-    this->v2rho2_.resize(((1 == nspin) ? 1 : 3) * nrxx);    // (nrxx * ((1 == nspin) ? 1 : 3)): 00, 01, 11
+    this->vrho_.resize(nspin * nrxx, 0.);
+    this->v2rho2_.resize(((1 == nspin) ? 1 : 3) * nrxx, 0.);    // (nrxx * ((1 == nspin) ? 1 : 3)): 00, 01, 11
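+    // The second resize argument zero-fills these buffers; the per-functional
+    // accumulation below adds onto them, so they must start from zero.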
     if (is_gga)
     {
-        this->vsigma_.resize(((1 == nspin) ? 1 : 3) * nrxx);    // (nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
-        this->v2rhosigma_.resize(((1 == nspin) ? 1 : 6) * nrxx);    // (nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
-        this->v2sigma2_.resize(((1 == nspin) ? 1 : 6) * nrxx);    // (nrxx * ((1 == nspin) ? 1 : 6)): 00, 01, 02, 11, 12, 22
+        this->vsigma_.resize(((1 == nspin) ? 1 : 3) * nrxx, 0.);    // (nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
+        this->v2rhosigma_.resize(((1 == nspin) ? 1 : 6) * nrxx, 0.);    // (nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
+        this->v2sigma2_.resize(((1 == nspin) ? 1 : 6) * nrxx, 0.);    // (nrxx * ((1 == nspin) ? 1 : 6)): 00, 01, 02, 11, 12, 22
     }
     // MetaGGA ...
 
@@ -163,105 +192,85 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
 
         // cut off grho if not LDA (future subfunc)
 
+        // Libxc interfaces overwrite (instead of add onto) the output arrays, so we need temporary copies
+        std::vector<double> vrho_tmp(this->vrho_.size());
+        std::vector<double> v2rho2_tmp(this->v2rho2_.size());
+        std::vector<double> vsigma_tmp(this->vsigma_.size());
+        std::vector<double> v2rhosigma_tmp(this->v2rhosigma_.size());
+        std::vector<double> v2sigma2_tmp(this->v2sigma2_.size());
         switch (func.info->family)
         {
         case XC_FAMILY_LDA:
-            xc_lda_vxc(&func, nrxx, rho.data(), vrho_.data());
-            xc_lda_fxc(&func, nrxx, rho.data(), v2rho2_.data());
+            xc_lda_vxc(&func, nrxx, rho.data(), vrho_tmp.data());
+            xc_lda_fxc(&func, nrxx, rho.data(), v2rho2_tmp.data());
            break;
         case XC_FAMILY_GGA:
         case XC_FAMILY_HYB_GGA:
-            xc_gga_vxc(&func, nrxx, rho.data(), sigma.data(), vrho_.data(), vsigma_.data());
-            xc_gga_fxc(&func, nrxx, rho.data(), sigma.data(), v2rho2_.data(), v2rhosigma_.data(), v2sigma2_.data());
+            xc_gga_vxc(&func, nrxx, rho.data(), sigma.data(), vrho_tmp.data(), vsigma_tmp.data());
+            xc_gga_fxc(&func, nrxx, rho.data(), sigma.data(), v2rho2_tmp.data(), v2rhosigma_tmp.data(), v2sigma2_tmp.data());
            break;
         default:
            throw std::domain_error("func.info->family =" + std::to_string(func.info->family)
                + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
            break;
         }
-        // some formulas for GGA
-        if (func.info->family == XC_FAMILY_GGA || func.info->family == XC_FAMILY_HYB_GGA)
-        {
-            const std::vector<double>& v2r2 = this->v2rho2_;
-            const std::vector<double>& v2rs = this->v2rhosigma_;
-            const std::vector<double>& v2s2 = this->v2sigma2_;
-            const std::vector<double>& vs = this->vsigma_;
-            const double tpiba2 = tpiba * tpiba;
+        // add onto the total components
+        // auto start = std::chrono::high_resolution_clock::now();
+        add_assign_op(vrho_tmp, this->vrho_);
+        add_assign_op(v2rho2_tmp, this->v2rho2_);
+        add_assign_op(vsigma_tmp, this->vsigma_);
+        add_assign_op(v2rhosigma_tmp, this->v2rhosigma_);
+        add_assign_op(v2sigma2_tmp, this->v2sigma2_);
+        // auto end = std::chrono::high_resolution_clock::now();
+        // auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        // std::cout << "Time elapsed adding XC components: " << duration.count() << " ms\n";
+    }   // end for( xc_func_type &func : funcs )
+
+    XC_Functional_Libxc::finish_func(funcs);
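+    // With every functional's contribution accumulated above, the Libxc functional
+    // objects in `funcs` are finalized here once, after the loop.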
 
-    if (1 == nspin)
+    // some postprocess if there're GGA funcs in the list
+    if (is_gga)
+    {
+        const std::vector<double>& v2r2 = this->v2rho2_;
+        const std::vector<double>& v2rs = this->v2rhosigma_;
+        const std::vector<double>& v2s2 = this->v2sigma2_;
+        const std::vector<double>& vs = this->vsigma_;
+        const double tpiba2 = tpiba * tpiba;
+
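+        // Chain-rule note: since sigma = grad(rho) . grad(rho), d(sigma)/d(grad rho) = 2*grad(rho),
+        // so one sigma derivative contributes a factor 2*grad(rho) and two contribute 4*grad(rho).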
+        if (nspin == 1)
+        {
+            // 0. drho
+            this->drho_gs_ = gradrho[0];
+            // 1. $2f^{\rho\sigma}*\nabla\rho$
+            this->v2rhosigma_2drho_.resize(nrxx);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096)
+#endif
+            for (size_t i = 0; i < nrxx; ++i)
             {
-            using V3 = ModuleBase::Vector3<double>;
-            // 0. drho
-            this->drho_gs_ = gradrho[0];
-            // 1. $2f^{\rho\sigma}*\nabla\rho$
-            this->v2rhosigma_2drho_.resize(nrxx);
-            std::transform(gradrho[0].begin(), gradrho[0].end(), v2rs.begin(), this->v2rhosigma_2drho_.begin(),
-                [](const V3& a, const V3& b) {return a * b * 2.; });
-            // 2. $4f^{\sigma\sigma}*\nabla\rho$
-            this->v2sigma2_4drho_.resize(nrxx);
-            std::transform(sigma.begin(), sigma.end(), v2s2.begin(), v2sigma2_4drho_.begin(),
-                [](const V3& a, const V3& b) {return a * b * 4.; });
+                this->v2rhosigma_2drho_[i] = gradrho[0][i] * v2rs[i] * 2.;
             }
-        // else if (2 == nspin)  // wrong, to be fixed
-        // {
-        //     // 1. $\nabla\cdot(f^{\rho\sigma}*\nabla\rho)$
-        //     std::vector<double> div_v2rhosigma_gdrho_r(3 * nrxx);
-        //     std::vector<ModuleBase::Vector3<double>> v2rhosigma_gdrho_r(3 * nrxx);
-        //     for (int ir = 0; ir < nrxx; ++ir)
-        //     {
-        //         v2rhosigma_gdrho_r[ir] = gradrho[0][ir] * v2rs.at(ir * 6) * 4.0
-        //             + gradrho[1][ir] * v2rs.at(ir * 6 + 1) * 2.0;    //up-up
-        //         v2rhosigma_gdrho_r[nrxx + ir] = gradrho[0][ir] * (v2rs.at(ir * 6 + 3) * 2.0 + v2rs.at(ir * 6 + 1))
-        //             + gradrho[1][ir] * (v2rs.at(ir * 6 + 2) * 2.0 + v2rs.at(ir * 6 + 4));    //up-down
-        //         v2rhosigma_gdrho_r[2 * nrxx + ir] = gradrho[1][ir] * v2rs.at(ir * 6 + 5) * 4.0
-        //             + gradrho[0][ir] * v2rs.at(ir * 6 + 4) * 2.0;    //down-down
-        //     }
-        //     for (int isig = 0;isig < 3;++isig)
-        //         XC_Functional::grad_dot(v2rhosigma_gdrho_r.data() + isig * nrxx, div_v2rhosigma_gdrho_r.data() + isig * nrxx, chg_gs.rhopw, tpiba);
-        //     // 2. $\nabla^2(f^{\sigma\sigma}*\sigma)$
-        //     std::vector<double> v2sigma2_sigma_r(3 * nrxx);
-        //     for (int ir = 0; ir < nrxx; ++ir)
-        //     {
-        //         v2sigma2_sigma_r[ir] = v2s2.at(ir * 6) * sigma[ir * 3] * 4.0
-        //             + v2s2.at(ir * 6 + 1) * sigma[ir * 3 + 1] * 4.0
-        //             + v2s2.at(ir * 6 + 3) * sigma[ir * 3 + 2];    //up-up
-        //         v2sigma2_sigma_r[nrxx + ir] = v2s2.at(ir * 6 + 1) * sigma[ir * 3] * 2.0
-        //             + v2s2.at(ir * 6 + 4) * sigma[ir * 3 + 2] * 2.0
-        //             + (v2s2.at(ir * 6 + 2) * 4.0 + v2s2.at(ir * 6 + 3)) * sigma[ir * 3 + 1];    //up-down
-        //         v2sigma2_sigma_r[2 * nrxx + ir] = v2s2.at(ir * 6 + 5) * sigma[ir * 3 + 2] * 4.0
-        //             + v2s2.at(ir * 6 + 4) * sigma[ir * 3 + 1] * 4.0
-        //             + v2s2.at(ir * 6 + 3) * sigma[ir * 3];    //down-down
-        //     }
-        //     for (int isig = 0;isig < 3;++isig)
-        //         LR_Util::lapl(v2sigma2_sigma_r.data() + isig * nrxx, v2sigma2_sigma_r.data() + isig * nrxx, *(chg_gs.rhopw), tpiba2);
-        //     // 3. $\nabla^2(v^\sigma)$
-        //     std::vector<double> lap_vsigma(3 * nrxx);
-        //     for (int ir = 0;ir < nrxx;++ir)
-        //     {
-        //         lap_vsigma[ir] = vs.at(ir * 3) * 2.0;
-        //         lap_vsigma[nrxx + ir] = vs.at(ir * 3 + 1);
-        //         lap_vsigma[2 * nrxx + ir] = vs.at(ir * 3 + 2) * 2.0;
-        //     }
-        //     for (int isig = 0;isig < 3;++isig)
-        //         LR_Util::lapl(lap_vsigma.data() + isig * nrxx, lap_vsigma.data() + isig * nrxx, *(chg_gs.rhopw), tpiba2);
-        //     // add to v2rho2_
-        //     BlasConnector::axpy(3 * nrxx, 1.0, v2r2.data(), 1, to_mul_rho_.data(), 1);
-        //     BlasConnector::axpy(3 * nrxx, -1.0, div_v2rhosigma_gdrho_r.data(), 1, to_mul_rho_.data(), 1);
-        //     BlasConnector::axpy(3 * nrxx, 1.0, v2sigma2_sigma_r.data(), 1, to_mul_rho_.data(), 1);
-        //     BlasConnector::axpy(3 * nrxx, 1.0, lap_vsigma.data(), 1, to_mul_rho_.data(), 1);
-        // }
-        else
+
+            // 2. $4f^{\sigma\sigma}*\nabla\rho$
+            this->v2sigma2_4drho_.resize(nrxx);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096)
+#endif
+            for (size_t i = 0; i < nrxx; ++i)
             {
-            throw std::domain_error("nspin =" + std::to_string(nspin)
-                + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
+                this->v2sigma2_4drho_[i] = gradrho[0][i] * v2s2[i] * 4.;
             }
         }
-    }   // end for( xc_func_type &func : funcs )
-    XC_Functional_Libxc::finish_func(funcs);
-
-    if (1 == PARAM.inp.nspin || 2 == PARAM.inp.nspin) return;
+        else
+        {
+            throw std::domain_error("nspin =" + std::to_string(nspin)
+                + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
+        }
+    }
+    if (PARAM.inp.nspin == 1 || PARAM.inp.nspin == 2) {
+        return;
     // else if (4 == PARAM.inp.nspin)
-    else    // NSPIN != 1,2,4 is not supported
+    } else    // NSPIN != 1,2,4 is not supported
     {
         throw std::domain_error("PARAM.inp.nspin =" + std::to_string(PARAM.inp.nspin)
             + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));