deepmodeling
diff --git a/Dockerfile.intel b/Dockerfile.intel
Lines changed: 3 additions & 0 deletions
diff --git a/docs/index.rst b/docs/index.rst
Lines changed: 1 addition & 1 deletion
diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
Lines changed: 27 additions & 15 deletions
diff --git a/source/module_base/kernels/math_kernel_op.cpp b/source/module_base/kernels/math_kernel_op.cpp
Lines changed: 16 additions & 3 deletions
diff --git a/source/module_base/kernels/math_kernel_op.h b/source/module_base/kernels/math_kernel_op.h
Lines changed: 3 additions & 4 deletions
diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
Lines changed: 24 additions & 11 deletions
diff --git a/source/module_base/module_device/rocm/memory_op.hip.cu b/source/module_base/module_device/rocm/memory_op.hip.cu
Lines changed: 2 additions & 6 deletions
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp
Lines changed: 3 additions & 3 deletions
@@ -44,6 +44,9 @@ RUN source /opt/intel/oneapi/setvars.sh && \
     ln -s /usr/local/include/elpa_openmp-$ELPA_VER/elpa /usr/local/include/ && \
     cd /tmp && rm -rf elpa-$ELPA_VER
 
+RUN cd /tmp && git clone https://github.com/Tencent/rapidjson.git && cp -r rapidjson/include/rapidjson /usr/include/ \
+    && rm -rf rapidjson
+
 RUN wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip \
         --no-check-certificate --quiet -O libtorch.zip && \
     unzip -q libtorch.zip -d /opt  && rm libtorch.zip
 
@@ -7,7 +7,7 @@
 ABACUS Documentation
 =================================================
 
-ABACUS (**A**tomic-orbital **B**ased **A**b-initio **C**omputation at **US**tc) is
+ABACUS (Atomic-orbital Based Ab-initio Computation at UStc) is
 an open-source computer code package based on density functional
 theory (DFT). The package utilizes both plane wave and numerical
 atomic basis sets with the usage of pseudopotentials
 
@@ -325,16 +325,23 @@ __global__ void vector_div_constant_kernel(
 }
 
 template <typename T>
-__global__ void vector_mul_vector_kernel(
-    const int size,
-    T* result,
-    const T* vector1,
-    const typename GetTypeReal<T>::type* vector2)
+__global__ void vector_mul_vector_kernel(const int size,
+                                         T* result,
+                                         const T* vector1,
+                                         const typename GetTypeReal<T>::type* vector2,
+                                         const bool add)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
     {
-        result[i] = vector1[i] * vector2[i];
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
     }
 }
 
@@ -548,11 +555,12 @@ template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                        double* result,
                                                                        const double* vector1,
-                                                                       const double* vector2)
+                                                                       const double* vector2,
+                                                                       const bool& add)
 {
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2);
+    vector_mul_vector_kernel<double> <<<block, thread >>> (dim, result, vector1, vector2, add);
 
     cudaCheckOnDebug();
 }
@@ -561,32 +569,35 @@ template <typename FPTYPE>
 inline void vector_mul_vector_complex_wrapper(const int& dim,
                                               std::complex<FPTYPE>* result,
                                               const std::complex<FPTYPE>* vector1,
-                                              const FPTYPE* vector2)
+                                              const FPTYPE* vector2,
+                                              const bool& add)
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = thread_per_block;
     int block = (dim + thread - 1) / thread;
-    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2);
+    vector_mul_vector_kernel<thrust::complex<FPTYPE>> <<<block, thread >>> (dim, result_tmp, vector1_tmp, vector2, add);
 
     cudaCheckOnDebug();
 }
 template <>
 void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                     std::complex<float>* result,
                                                                                     const std::complex<float>* vector1,
-                                                                                    const float* vector2)
+                                                                                    const float* vector2,
+                                                                                    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 template <>
 void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
     const int& dim,
     std::complex<double>* result,
     const std::complex<double>* vector1,
-    const double* vector2)
+    const double* vector2,
+    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 
 // vector operator: result[i] = vector1[i](not complex) / vector2[i](not complex)
@@ -1019,6 +1030,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>;
@@ -1029,6 +1041,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
@@ -1039,7 +1052,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
-template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 #endif
 }  // namespace ModuleBase
@@ -167,14 +167,27 @@ template <typename T>
 struct vector_mul_vector_op<T, base_device::DEVICE_CPU>
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const int& dim, T* result, const T* vector1, const Real* vector2)
+    void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add)
     {
+        if (add)
+        {
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static, 4096 / sizeof(Real))
 #endif
-        for (int i = 0; i < dim; i++)
+            for (int i = 0; i < dim; i++)
+            {
+                result[i] += vector1[i] * vector2[i];
+            }
+        }
+        else
         {
-            result[i] = vector1[i] * vector2[i];
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 4096 / sizeof(Real))
+#endif
+            for (int i = 0; i < dim; i++)
+            {
+                result[i] = vector1[i] * vector2[i];
+            }
         }
     }
 };
 
@@ -143,11 +143,11 @@ template <typename T, typename Device> struct vector_mul_vector_op {
   /// \param dim : array size
   /// \param vector1 : input array A
   /// \param vector2 : input array B
+  /// \param add : if true, accumulate the product into the output array (result[i] += vector1[i] * vector2[i]); if false (the default), overwrite it
   ///
   /// Output Parameters
   /// \param result : output array
-  void operator()(const int &dim, T *result, const T *vector1,
-                  const Real *vector2);
+  void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
@@ -350,8 +350,7 @@ struct vector_div_constant_op<T, base_device::DEVICE_GPU> {
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
 template <typename T> struct vector_mul_vector_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
-  void operator()(const int &dim, T *result,
-                  const T *vector1, const Real *vector2);
+  void operator()(const int& dim, T* result, const T* vector1, const Real* vector2, const bool& add = false);
 };
 
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 
@@ -248,12 +248,20 @@ __global__ void vector_mul_vector_kernel(
     const int size,
     T* result,
     const T* vector1,
-    const typename GetTypeReal<T>::type* vector2)
+    const typename GetTypeReal<T>::type* vector2,
+    const bool add)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < size)
     {
-        result[i] = vector1[i] * vector2[i];
+        if (add)
+        {
+            result[i] += vector1[i] * vector2[i];
+        }
+        else
+        {
+            result[i] = vector1[i] * vector2[i];
+        }
     }
 }
 
@@ -471,11 +479,12 @@ template <>
 void vector_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                        double* result,
                                                                        const double* vector1,
-                                                                       const double* vector2)
+                                                                       const double* vector2,
+                                                                       const bool& add)
 {
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<double>), dim3(block), dim3(thread), 0, 0, dim, result, vector1, vector2, add);
 
     hipCheckOnDebug();
 }
@@ -485,32 +494,35 @@ template <typename FPTYPE>
 inline void vector_mul_vector_complex_wrapper(const int& dim,
                                               std::complex<FPTYPE>* result,
                                               const std::complex<FPTYPE>* vector1,
-                                              const FPTYPE* vector2)
+                                              const FPTYPE* vector2,
+                                              const bool& add)
 {
     thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
     const thrust::complex<FPTYPE>* vector1_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector1);
     int thread = 1024;
     int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_mul_vector_kernel<thrust::complex<FPTYPE>>), dim3(block), dim3(thread), 0, 0, dim, result_tmp, vector1_tmp, vector2, add);
 
     hipCheckOnDebug();
 }
 template <>
 void vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                                                     std::complex<float>* result,
                                                                                     const std::complex<float>* vector1,
-                                                                                    const float* vector2)
+                                                                                    const float* vector2,
+                                                                                    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 template <>
 void vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(
     const int& dim,
     std::complex<double>* result,
     const std::complex<double>* vector1,
-    const double* vector2)
+    const double* vector2,
+    const bool& add)
 {
-    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2);
+    vector_mul_vector_complex_wrapper(dim, result, vector1, vector2, add);
 }
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <>
@@ -931,6 +943,7 @@ template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<float, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -940,6 +953,7 @@ template struct dot_real_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
@@ -948,7 +962,6 @@ template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
-template struct vector_mul_vector_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 
@@ -108,9 +108,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
         // No need to cast the memory if the data types are the same.
         if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
         {
-            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
-                                                                                                  dev_in,
-                                                                                                  arr_out,
+            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(arr_out,
                                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
                                                                                                   size);
             return;
@@ -135,9 +133,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_devic
         // No need to cast the memory if the data types are the same.
         if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
         {
-            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
-                                                                                                  dev_in,
-                                                                                                  arr_out,
+            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(arr_out,
                                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
                                                                                                   size);
             return;
 
@@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
     //! 7) Use Kubo-Greenwood method to compute conductivities
     if (PARAM.inp.cal_cond)
     {
-        EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);
+        EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);
         elec_cond.KG(PARAM.inp.cond_smear,
                      PARAM.inp.cond_fwhm,
                      PARAM.inp.cond_wcut,
 
@@ -26,9 +26,9 @@ void sparse_format::cal_dH(const UnitCell& ucell,
     fsr_dh.DHloc_fixedR_y = new double[nnr];
     fsr_dh.DHloc_fixedR_z = new double[nnr];
 
-    ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_x, pv.nloc);
-    ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_y, pv.nloc);
-    ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_z, pv.nloc);
+    ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_x, nnr);
+    ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_y, nnr);
+    ModuleBase::GlobalFunc::ZEROS(fsr_dh.DHloc_fixedR_z, nnr);
     // cal dT=<phi|kin|dphi> in LCAO
     // cal T + VNL(P1) in LCAO basis
     const bool cal_deri = true;
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -943,7 +943,7 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)` |
| `943` | `943` | `//! 7) Use Kubo-Greenwood method to compute conductivities` |
| `944` | `944` | `if (PARAM.inp.cal_cond)` |
| `945` | `945` | `{` |
| `946` |  | `- EleCond elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->psi, &this->ppcell);` |
|  | `946` | `+ EleCond<Real, Device> elec_cond(&ucell, &this->kv, this->pelec, this->pw_wfc, this->kspw_psi, &this->ppcell);` |
| `947` | `947` | `elec_cond.KG(PARAM.inp.cond_smear,` |
| `948` | `948` | `PARAM.inp.cond_fwhm,` |
| `949` | `949` | `PARAM.inp.cond_wcut,` |