deepmodeling · mohanchen · Mar 18, 2025 · Mar 13, 2025 · Mar 15, 2025 · Mar 15, 2025
diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel(
 }
 
 template <typename T>
-__global__ void vector_mul_vector_kernel(
-    const int size,
-    T* result,
-    const T* vector1,
-    const typename GetTypeReal<T>::type* vector2)
+__global__ void vector_mul_vector_kernel(const int size,
+                                         T* result,
+                                         const T* vector1,
+                                         const typename GetTypeReal<T>::type* vector2,
+                                         const bool add)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
     {
-        result[i] = vector1[i] * vector2[i];
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
     }
 }
 
@@ -548,11 +555,12 @@ template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                        double* result,
                                                                        const double* vector1,
-                                                                       const double* vector2)
+                                                                       const double* vector2,
+                                                                       const bool& add)
 {
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
+    vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2, add);
 
     cudaCheckOnDebug();
 }
@@ -561,32 +569,35 @@ template <typename FPTYPE>
 inline void vector_mul_vector_complex_wrapper(const int& dim,
                                               std::complex<FPTYPE>* result,
                                               const std::complex<FPTYPE>* vector1,
-                                              const FPTYPE* vector2)
+                                              const FPTYPE* vector2,
+                                              const bool& add)
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
+    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2, add);
 
     cudaCheckOnDebug();
 }
 template <>
 void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                     std::complex<float>* result,
                                                                                     const std::complex<float>* vector1,
-                                                                                    const float* vector2)
+                                                                                    const float* vector2,
+                                                                                    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 template <>
 void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
     const int& dim,
     std::complex<double>* result,
     const std::complex<double>* vector1,
-    const double* vector2)
+    const double* vector2,
+    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 
 // vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex)
@@ -1019,6 +1030,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
@@ -1029,6 +1041,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
@@ -1039,7 +1052,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
-template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace ModuleBase
diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp
@@ -167,14 +167,27 @@ template <typename T>
 struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
+    void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add)
     {
+        if (add)
+        {
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 4096 / sizeof(Real))
 #endif
-        for (int i = 0; i < dim; i++)
+            for (int i = 0; i < dim; i++)
+            {
+                result[i] += vector1[i] * vector2[i];
+            }
+        }
+        else
         {
-            result[i] = vector1[i] * vector2[i];
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+            for (int i = 0; i < dim; i++)
+            {
+                result[i] = vector1[i] * vector2[i];
+            }
         }
     }
 };

diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
@@ -143,11 +143,11 @@ template <typename T, typename Device> struct vector_mul_vector_op {
   /// \param dim : array size
   /// \param vector1 : input array A
   /// \param vector2 : input array B
+  /// \param add : flag to control whether to add the result to the output array
   ///
   /// Output Parameters
   /// \param result : output array
-  void operator()(const int &dim, T *result, const T *vector1,
-                  const Real *vector2);
+  void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
@@ -350,8 +350,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
 template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
-  void operator()(const int &dim, T *result,
-                  const T *vector1, const Real *vector2);
+  void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)

diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
     const int size,
     T* result,
     const T* vector1,
-    const typename GetTypeReal<T>::type* vector2)
+    const typename GetTypeReal<T>::type* vector2,
+    const bool add)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
     {
-        result[i] = vector1[i] * vector2[i];
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
     }
 }
 
@@ -471,11 +479,12 @@ template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                        double* result,
                                                                        const double* vector1,
-                                                                       const double* vector2)
+                                                                       const double* vector2,
+                                                                       const bool& add)
 {
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add);
 
     hipCheckOnDebug();
 }
@@ -485,32 +494,35 @@ template <typename FPTYPE>
 inline void vector_mul_vector_complex_wrapper(const int& dim,
                                               std::complex<FPTYPE>* result,
                                               const std::complex<FPTYPE>* vector1,
-                                              const FPTYPE* vector2)
+                                              const FPTYPE* vector2,
+                                              const bool& add)
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add);
 
     hipCheckOnDebug();
 }
 template <>
 void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                     std::complex<float>* result,
                                                                                     const std::complex<float>* vector1,
-                                                                                    const float* vector2)
+                                                                                    const float* vector2,
+                                                                                    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 template <>
 void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
     const int& dim,
     std::complex<double>* result,
     const std::complex<double>* vector1,
-    const double* vector2)
+    const double* vector2,
+    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <>
@@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
-template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;

diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
@@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
     //! 7) Use Kubo-Greenwood method to compute conductivities
     if (PARAM.inp.cal_cond)
     {
-        EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);
+        EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);
         elec_cond.KG(PARAM.inp.cond_smear,
                      PARAM.inp.cond_fwhm,
                      PARAM.inp.cond_wcut,