diff --git a/source/module_lr/potentials/xc_kernel.cpp b/source/module_lr/potentials/xc_kernel.cpp
index f5fa4f8fc8..7f7275ac54 100644
--- a/source/module_lr/potentials/xc_kernel.cpp
+++ b/source/module_lr/potentials/xc_kernel.cpp
@@ -5,11 +5,15 @@
 #include "module_lr/utils/lr_util.h"
 #include "module_lr/utils/lr_util_xc.hpp"
 #include
+#include
 #include "module_io/cube_io.h"
 #ifdef USE_LIBXC
 #include <xc.h>
 #include "module_hamilt_general/module_xc/xc_functional_libxc.h"
 #endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 LR::KernelXC::KernelXC(const ModulePW::PW_Basis& rho_basis,
     const UnitCell& ucell,
@@ -66,6 +70,29 @@ LR::KernelXC::KernelXC(const ModulePW::PW_Basis& rho_basis,
 #endif
 }
 
+template <typename T>
+inline void add_op(const T* const src1, const T* const src2, T* const dst, const int size)
+{
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096)
+#endif
+    for (int i = 0;i < size;++i)
+    {
+        dst[i] = src1[i] + src2[i];
+    }
+}
+template <typename T>
+inline void add_op(const std::vector<T>& src1, const std::vector<T>& src2, std::vector<T>& dst)
+{
+    assert(dst.size() >= src1.size() && src2.size() >= src1.size());
+    add_op(src1.data(), src2.data(), dst.data(), src1.size());
+}
+template <typename T>
+inline void add_assign_op(const std::vector<T>& src, std::vector<T>& dst)
+{
+    add_op(src, dst, dst);
+}
+
 #ifdef USE_LIBXC
 void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const double& tpiba, const double* const* const rho_gs, const double* const rho_core)
 {
@@ -115,7 +142,8 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 1024)
 #endif
-            for (int ir = 0; ir < nrxx; ++ir) rhor[ir] = rho[ir * nspin + is];
+            for (int ir = 0; ir < nrxx; ++ir) { rhor[ir] = rho[ir * nspin + is];
+}
             gradrho[is].resize(nrxx);
             LR_Util::grad(rhor.data(), gradrho[is].data(), rho_basis_, tpiba);
         }
@@ -126,8 +154,9 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 1024)
 #endif
-            for (int ir = 0; ir < nrxx; ++ir)
+            for (int ir = 0; ir < nrxx; ++ir) {
                 sigma[ir] = gradrho[0][ir] * gradrho[0][ir];
+}
         }
         else
         {
@@ -144,13 +173,13 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
     }
     // -----------------------------------------------------------------------------------
     //==================== XC Kernels (f_xc)=============================
-    this->vrho_.resize(nspin * nrxx);
-    this->v2rho2_.resize(((1 == nspin) ? 1 : 3) * nrxx);//(nrxx* ((1 == nspin) ? 1 : 3)): 00, 01, 11
+    this->vrho_.resize(nspin * nrxx, 0.);
+    this->v2rho2_.resize(((1 == nspin) ? 1 : 3) * nrxx, 0.);//(nrxx* ((1 == nspin) ? 1 : 3)): 00, 01, 11
     if (is_gga)
     {
-        this->vsigma_.resize(((1 == nspin) ? 1 : 3) * nrxx);//(nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
-        this->v2rhosigma_.resize(((1 == nspin) ? 1 : 6) * nrxx); //(nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
-        this->v2sigma2_.resize(((1 == nspin) ? 1 : 6) * nrxx); //(nrxx* ((1 == nspin) ? 1 : 6)): 00, 01, 02, 11, 12, 22
+        this->vsigma_.resize(((1 == nspin) ? 1 : 3) * nrxx, 0.);//(nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
+        this->v2rhosigma_.resize(((1 == nspin) ? 1 : 6) * nrxx, 0.); //(nrxx*): 2 for rho * 3 for sigma: 00, 01, 02, 10, 11, 12
+        this->v2sigma2_.resize(((1 == nspin) ? 1 : 6) * nrxx, 0.); //(nrxx* ((1 == nspin) ? 1 : 6)): 00, 01, 02, 11, 12, 22
     }
     //MetaGGA ...
 
@@ -163,105 +192,85 @@ void LR::KernelXC::f_xc_libxc(const int& nspin, const double& omega, const doubl
         //cut off grho if not LDA (future subfunc)
 
+        // Libxc interfaces overwrite (instead of add onto) the output arrays, so we need temporary copies
+        std::vector<double> vrho_tmp(this->vrho_.size());
+        std::vector<double> v2rho2_tmp(this->v2rho2_.size());
+        std::vector<double> vsigma_tmp(this->vsigma_.size());
+        std::vector<double> v2rhosigma_tmp(this->v2rhosigma_.size());
+        std::vector<double> v2sigma2_tmp(this->v2sigma2_.size());
         switch (func.info->family)
         {
         case XC_FAMILY_LDA:
-            xc_lda_vxc(&func, nrxx, rho.data(), vrho_.data());
-            xc_lda_fxc(&func, nrxx, rho.data(), v2rho2_.data());
+            xc_lda_vxc(&func, nrxx, rho.data(), vrho_tmp.data());
+            xc_lda_fxc(&func, nrxx, rho.data(), v2rho2_tmp.data());
             break;
         case XC_FAMILY_GGA:
         case XC_FAMILY_HYB_GGA:
-            xc_gga_vxc(&func, nrxx, rho.data(), sigma.data(), vrho_.data(), vsigma_.data());
-            xc_gga_fxc(&func, nrxx, rho.data(), sigma.data(), v2rho2_.data(), v2rhosigma_.data(), v2sigma2_.data());
+            xc_gga_vxc(&func, nrxx, rho.data(), sigma.data(), vrho_tmp.data(), vsigma_tmp.data());
+            xc_gga_fxc(&func, nrxx, rho.data(), sigma.data(), v2rho2_tmp.data(), v2rhosigma_tmp.data(), v2sigma2_tmp.data());
             break;
         default:
             throw std::domain_error("func.info->family =" + std::to_string(func.info->family)
                 + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
             break;
         }
 
-        // some formulas for GGA
-        if (func.info->family == XC_FAMILY_GGA || func.info->family == XC_FAMILY_HYB_GGA)
-        {
-            const std::vector<double>& v2r2 = this->v2rho2_;
-            const std::vector<double>& v2rs = this->v2rhosigma_;
-            const std::vector<double>& v2s2 = this->v2sigma2_;
-            const std::vector<double>& vs = this->vsigma_;
-            const double tpiba2 = tpiba * tpiba;
+        // add onto the total components
+        // auto start = std::chrono::high_resolution_clock::now();
+        add_assign_op(vrho_tmp, this->vrho_);
+        add_assign_op(v2rho2_tmp, this->v2rho2_);
+        add_assign_op(vsigma_tmp, this->vsigma_);
+        add_assign_op(v2rhosigma_tmp, this->v2rhosigma_);
+        add_assign_op(v2sigma2_tmp, this->v2sigma2_);
+        // auto end = std::chrono::high_resolution_clock::now();
+        // auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+        // std::cout << "Time elapsed adding XC components: " << duration.count() << " ms\n";
+    } // end for( xc_func_type &func : funcs )
+
+    XC_Functional_Libxc::finish_func(funcs);
 
-            if (1 == nspin)
+    // some postprocess if there're GGA funcs in the list
+    if (is_gga)
+    {
+        const std::vector<double>& v2r2 = this->v2rho2_;
+        const std::vector<double>& v2rs = this->v2rhosigma_;
+        const std::vector<double>& v2s2 = this->v2sigma2_;
+        const std::vector<double>& vs = this->vsigma_;
+        const double tpiba2 = tpiba * tpiba;
+
+        if (nspin == 1)
+        {
+            // 0. drho
+            this->drho_gs_ = gradrho[0];
+            // 1. $2f^{\rho\sigma}*\nabla\rho$
+            this->v2rhosigma_2drho_.resize(nrxx);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096)
+#endif
+            for (size_t i = 0; i < nrxx; ++i)
             {
-                using V3 = ModuleBase::Vector3<double>;
-                // 0. drho
-                this->drho_gs_ = gradrho[0];
-                // 1. $2f^{\rho\sigma}*\nabla\rho$
-                this->v2rhosigma_2drho_.resize(nrxx);
-                std::transform(gradrho[0].begin(), gradrho[0].end(), v2rs.begin(), this->v2rhosigma_2drho_.begin(),
-                    [](const V3& a, const V3& b) {return a * b * 2.; });
-                // 2. $4f^{\sigma\sigma}*\nabla\rho$
-                this->v2sigma2_4drho_.resize(nrxx);
-                std::transform(sigma.begin(), sigma.end(), v2s2.begin(), v2sigma2_4drho_.begin(),
-                    [](const V3& a, const V3& b) {return a * b * 4.; });
+                this->v2rhosigma_2drho_[i] = gradrho[0][i] * v2rs[i] * 2.;
             }
-            // else if (2 == nspin) // wrong, to be fixed
-            // {
-            //     // 1. $\nabla\cdot(f^{\rho\sigma}*\nabla\rho)$
-            //     std::vector<double> div_v2rhosigma_gdrho_r(3 * nrxx);
-            //     std::vector<ModuleBase::Vector3<double>> v2rhosigma_gdrho_r(3 * nrxx);
-            //     for (int ir = 0; ir < nrxx; ++ir)
-            //     {
-            //         v2rhosigma_gdrho_r[ir] = gradrho[0][ir] * v2rs.at(ir * 6) * 4.0
-            //             + gradrho[1][ir] * v2rs.at(ir * 6 + 1) * 2.0; //up-up
-            //         v2rhosigma_gdrho_r[nrxx + ir] = gradrho[0][ir] * (v2rs.at(ir * 6 + 3) * 2.0 + v2rs.at(ir * 6 + 1))
-            //             + gradrho[1][ir] * (v2rs.at(ir * 6 + 2) * 2.0 + v2rs.at(ir * 6 + 4)); //up-down
-            //         v2rhosigma_gdrho_r[2 * nrxx + ir] = gradrho[1][ir] * v2rs.at(ir * 6 + 5) * 4.0
-            //             + gradrho[0][ir] * v2rs.at(ir * 6 + 4) * 2.0; //down-down
-            //     }
-            //     for (int isig = 0;isig < 3;++isig)
-            //         XC_Functional::grad_dot(v2rhosigma_gdrho_r.data() + isig * nrxx, div_v2rhosigma_gdrho_r.data() + isig * nrxx, chg_gs.rhopw, tpiba);
-            //     // 2. $\nabla^2(f^{\sigma\sigma}*\sigma)$
-            //     std::vector<double> v2sigma2_sigma_r(3 * nrxx);
-            //     for (int ir = 0; ir < nrxx; ++ir)
-            //     {
-            //         v2sigma2_sigma_r[ir] = v2s2.at(ir * 6) * sigma[ir * 3] * 4.0
-            //             + v2s2.at(ir * 6 + 1) * sigma[ir * 3 + 1] * 4.0
-            //             + v2s2.at(ir * 6 + 3) * sigma[ir * 3 + 2]; //up-up
-            //         v2sigma2_sigma_r[nrxx + ir] = v2s2.at(ir * 6 + 1) * sigma[ir * 3] * 2.0
-            //             + v2s2.at(ir * 6 + 4) * sigma[ir * 3 + 2] * 2.0
-            //             + (v2s2.at(ir * 6 + 2) * 4.0 + v2s2.at(ir * 6 + 3)) * sigma[ir * 3 + 1]; //up-down
-            //         v2sigma2_sigma_r[2 * nrxx + ir] = v2s2.at(ir * 6 + 5) * sigma[ir * 3 + 2] * 4.0
-            //             + v2s2.at(ir * 6 + 4) * sigma[ir * 3 + 1] * 4.0
-            //             + v2s2.at(ir * 6 + 3) * sigma[ir * 3]; //down-down
-            //     }
-            //     for (int isig = 0;isig < 3;++isig)
-            //         LR_Util::lapl(v2sigma2_sigma_r.data() + isig * nrxx, v2sigma2_sigma_r.data() + isig * nrxx, *(chg_gs.rhopw), tpiba2);
-            //     // 3. $\nabla^2(v^\sigma)$
-            //     std::vector<double> lap_vsigma(3 * nrxx);
-            //     for (int ir = 0;ir < nrxx;++ir)
-            //     {
-            //         lap_vsigma[ir] = vs.at(ir * 3) * 2.0;
-            //         lap_vsigma[nrxx + ir] = vs.at(ir * 3 + 1);
-            //         lap_vsigma[2 * nrxx + ir] = vs.at(ir * 3 + 2) * 2.0;
-            //     }
-            //     for (int isig = 0;isig < 3;++isig)
-            //         LR_Util::lapl(lap_vsigma.data() + isig * nrxx, lap_vsigma.data() + isig * nrxx, *(chg_gs.rhopw), tpiba2);
-            //     // add to v2rho2_
-            //     BlasConnector::axpy(3 * nrxx, 1.0, v2r2.data(), 1, to_mul_rho_.data(), 1);
-            //     BlasConnector::axpy(3 * nrxx, -1.0, div_v2rhosigma_gdrho_r.data(), 1, to_mul_rho_.data(), 1);
-            //     BlasConnector::axpy(3 * nrxx, 1.0, v2sigma2_sigma_r.data(), 1, to_mul_rho_.data(), 1);
-            //     BlasConnector::axpy(3 * nrxx, 1.0, lap_vsigma.data(), 1, to_mul_rho_.data(), 1);
-            // }
-            else
+
+            // 2. $4f^{\sigma\sigma}*\nabla\rho$
+            this->v2sigma2_4drho_.resize(nrxx);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096)
+#endif
+            for (size_t i = 0; i < nrxx; ++i)
             {
-                throw std::domain_error("nspin =" + std::to_string(nspin)
-                    + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
+                this->v2sigma2_4drho_[i] = gradrho[0][i] * v2s2[i] * 4.;
             }
         }
-    } // end for( xc_func_type &func : funcs )
-    XC_Functional_Libxc::finish_func(funcs);
-
-    if (1 == PARAM.inp.nspin || 2 == PARAM.inp.nspin) return;
+        else
+        {
+            throw std::domain_error("nspin =" + std::to_string(nspin)
+                + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
+        }
+    }
+    if (PARAM.inp.nspin == 1 || PARAM.inp.nspin == 2) {
+        return;
     // else if (4 == PARAM.inp.nspin)
-    else//NSPIN != 1,2,4 is not supported
+    } else//NSPIN != 1,2,4 is not supported
     {
         throw std::domain_error("PARAM.inp.nspin =" + std::to_string(PARAM.inp.nspin)
             + " unfinished in " + std::string(__FILE__) + " line " + std::to_string(__LINE__));
diff --git a/tests/integrate/291_NO_KP_LR/result.ref b/tests/integrate/291_NO_KP_LR/result.ref
index 722b3fff61..b85ad819a5 100644
--- a/tests/integrate/291_NO_KP_LR/result.ref
+++ b/tests/integrate/291_NO_KP_LR/result.ref
@@ -1 +1 @@
-totexcitationenergyref 0.784274
\ No newline at end of file
+totexcitationenergyref 0.755653
\ No newline at end of file
diff --git a/tests/integrate/391_NO_GO_LR/result.ref b/tests/integrate/391_NO_GO_LR/result.ref
index 850f003f46..5c65fd6538 100644
--- a/tests/integrate/391_NO_GO_LR/result.ref
+++ b/tests/integrate/391_NO_GO_LR/result.ref
@@ -1 +1 @@
-totexcitationenergyref 2.641190
\ No newline at end of file
+totexcitationenergyref 2.510666
\ No newline at end of file
diff --git a/tests/integrate/393_NO_GO_ULR_HF/INPUT b/tests/integrate/393_NO_GO_ULR_HF/INPUT
index e9acbcda6d..c62c45b94e 100644
--- a/tests/integrate/393_NO_GO_ULR_HF/INPUT
+++ b/tests/integrate/393_NO_GO_ULR_HF/INPUT
@@ -27,12 +27,12 @@ mixing_type pulay
 mixing_beta 0.4
 mixing_gg0 0
 
-dft_functional lda
+dft_functional hf
 exx_real_number 1
 exx_ccp_rmesh_times 1
 
 lr_nstates 3
-xc_kernel lda
+xc_kernel hf
 lr_solver lapack
 lr_thr 1e-2
 lr_unrestricted 1
diff --git a/tests/integrate/393_NO_GO_ULR_HF/result.ref b/tests/integrate/393_NO_GO_ULR_HF/result.ref
index e395c0fb9d..947ae2baba 100644
--- a/tests/integrate/393_NO_GO_ULR_HF/result.ref
+++ b/tests/integrate/393_NO_GO_ULR_HF/result.ref
@@ -1 +1 @@
-totexcitationenergyref 1.940703
\ No newline at end of file
+totexcitationenergyref 2.187535
\ No newline at end of file
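Note on the core pattern in f_xc_libxc: as the patch's own comment states, libxc routines such as xc_lda_fxc and xc_gga_fxc overwrite their output arrays rather than adding to them, so each functional in the loop now writes into the *_tmp vectors and add_assign_op accumulates those temporaries onto the zero-initialized totals (vrho_, v2rho2_, vsigma_, v2rhosigma_, v2sigma2_). Below is a minimal, self-contained C++ sketch of that accumulate-via-temporary idea; the eval lambdas are hypothetical stand-ins for the libxc calls, and add_assign is a simplified counterpart of the patch's add_assign_op, not ABACUS or libxc API.

#include <cassert>
#include <cstdio>
#include <functional>
#include <vector>

// Element-wise dst += src, mirroring the add_assign_op helper added in the patch.
template <typename T>
void add_assign(const std::vector<T>& src, std::vector<T>& dst)
{
    assert(dst.size() >= src.size());
#ifdef _OPENMP
#pragma omp parallel for schedule(static, 4096)
#endif
    for (int i = 0; i < static_cast<int>(src.size()); ++i) { dst[i] += src[i]; }
}

int main()
{
    const int nrxx = 8;
    std::vector<double> v2rho2_total(nrxx, 0.0); // zero-initialized, like resize(n, 0.)

    // Hypothetical stand-ins for per-functional libxc calls: each one OVERWRITES its output.
    const std::vector<std::function<void(std::vector<double>&)>> funcs = {
        [](std::vector<double>& out) { for (double& x : out) { x = 1.0; } }, // "functional 1"
        [](std::vector<double>& out) { for (double& x : out) { x = 0.5; } }, // "functional 2"
    };

    for (const auto& eval_fxc : funcs)
    {
        std::vector<double> v2rho2_tmp(nrxx); // fresh temporary per functional
        eval_fxc(v2rho2_tmp);                 // overwrite semantics, like xc_lda_fxc
        add_assign(v2rho2_tmp, v2rho2_total); // accumulate onto the running total
    }

    std::printf("v2rho2_total[0] = %.2f\n", v2rho2_total[0]); // 1.50, the sum of both components
    return 0;
}

Zero-initializing the totals before the loop is what makes the first accumulation well defined, which is why the resize calls in the patch gained an explicit 0. initial value.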