abacusmodeling
diff --git a/‎source/module_base/include/math_multi_device.h‎
Lines changed: 17 additions & 0 deletions b/‎source/module_base/include/math_multi_device.h‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎source/module_elecstate/include/elecstate_multi_device.h‎
Lines changed: 25 additions & 1 deletion b/‎source/module_elecstate/include/elecstate_multi_device.h‎
Lines changed: 25 additions & 1 deletion
diff --git a/‎source/module_hamilt/include/ekinetic.h‎
Lines changed: 14 additions & 0 deletions b/‎source/module_hamilt/include/ekinetic.h‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎source/module_hamilt/include/nonlocal.h‎
Lines changed: 38 additions & 1 deletion b/‎source/module_hamilt/include/nonlocal.h‎
Lines changed: 38 additions & 1 deletion
diff --git a/‎source/module_hamilt/include/veff.h‎
Lines changed: 31 additions & 1 deletion b/‎source/module_hamilt/include/veff.h‎
Lines changed: 31 additions & 1 deletion
diff --git a/‎source/module_hsolver/include/math_kernel.h‎
Lines changed: 137 additions & 0 deletions b/‎source/module_hsolver/include/math_kernel.h‎
Lines changed: 137 additions & 0 deletions
@@ -8,6 +8,23 @@ namespace ModuleBase {
 
 template <typename FPTYPE, typename Device>
 struct cal_ylm_real_op {
+    /// @brief YLM_REAL::Real spherical harmonics ylm(G) up to l=lmax
+    /// Uses the numerical recursive algorithm given in Numerical Recipes
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param ng - problem size, i.e. the number of vectors stored in g
+    /// @param lmax - determined by lmax2
+    /// @param SQRT2 - ModuleBase::SQRT2
+    /// @param PI - ModuleBase::PI
+    /// @param PI_HALF - ModuleBase::PI_HALF
+    /// @param FOUR_PI - ModuleBase::FOUR_PI
+    /// @param SQRT_INVERSE_FOUR_PI - ModuleBase::SQRT_INVERSE_FOUR_PI
+    /// @param g - input array with size npw * 3, GlobalC::wf.get_1qvec_cartesian
+    /// @param p - intermediate array
+    ///
+    /// Output Parameters
+    /// @param ylm - output array
     void operator() (
         const Device *ctx,
         const int &ng,
 
@@ -9,14 +9,38 @@ namespace elecstate{
 
 template <typename FPTYPE, typename Device> 
 struct elecstate_pw_op {
+  /// @brief Calculate psiToRho output within the band-by-band loop, NSPIN != 4
+  ///
+  /// Input Parameters
+  /// @param ctx - which device this function runs on
+  /// @param spin - current spin
+  /// @param nrxx - number of planewaves
+  /// @param weight - input constant
+  /// @param wfcr - input array, psi in real space
+  ///
+  /// Output Parameters
+  /// @param rho - electronic densities
   void operator() (
       const Device* ctx,
       const int& spin,
       const int& nrxx,
       const FPTYPE& weight,
       FPTYPE** rho,
       const std::complex<FPTYPE>* wfcr);
-  
+
+  /// @brief Calculate psiToRho output within the band-by-band loop, NSPIN == 4
+  ///
+  /// Input Parameters
+  /// @param ctx - which device this function runs on
+  /// @param DOMAG - GlobalV::DOMAG
+  /// @param DOMAG_Z - GlobalV::DOMAG_Z
+  /// @param nrxx - number of planewaves
+  /// @param weight - input constant
+  /// @param wfcr - input array, psi in real space
+  /// @param wfcr_another_spin - input array, psi in real space
+  ///
+  /// Output Parameters
+  /// @param rho - electronic densities
   void operator() (
       const Device* ctx,
       const bool& DOMAG,
 
@@ -7,6 +7,20 @@
 namespace hamilt {
 template <typename FPTYPE, typename Device> 
 struct ekinetic_pw_op {
+  /// @brief Compute the ekinetic potential of hPsi
+  ///
+  /// Input Parameters
+  /// \param dev : the type of computing device
+  /// \param nband : nbands
+  /// \param npw : number of planewaves of current k point
+  /// \param max_npw : max number of planewaves of all k points
+  /// \param tpiba2 : GlobalC::ucell.tpiba2
+  /// \param spin : current spin
+  /// \param gk2_ik : GlobalC::wfcpw->gk2
+  /// \param tmpsi_in : intermediate array
+  ///
+  /// Output Parameters
+  /// \param tmhpsi : output array
   void operator() (
       const Device* dev,
       const int& nband,
 
@@ -7,6 +7,25 @@
 namespace hamilt {
 template <typename FPTYPE, typename Device> 
 struct nonlocal_pw_op {
+  /// @brief Compute the nonlocal potential of hPsi
+  ///
+  /// Input Parameters
+  /// \param dev : the type of computing device
+  /// \param l1 : ucell->atoms[it].na
+  /// \param l2 : nbands
+  /// \param l3 : ucell->atoms[it].ncpp.nh
+  /// \param sum : intermediate value
+  /// \param iat : intermediate value
+  /// \param spin : current spin
+  /// \param nkb : ppcell->nkb, number of kpoints
+  /// \param deeq_x : second dimension of deeq
+  /// \param deeq_y : third dimension of deeq
+  /// \param deeq_z : fourth dimension of deeq
+  /// \param deeq : ppcell->deeq
+  /// \param becp : intermediate array
+  ///
+  /// Output Parameters
+  /// \param ps : output array
   void operator() (
       const Device* dev,
       const int& l1,
@@ -22,7 +41,25 @@ struct nonlocal_pw_op {
       const FPTYPE* deeq,
       std::complex<FPTYPE>* ps,
       const std::complex<FPTYPE>* becp);
-  
+
+  /// @brief Compute the nonlocal potential of hPsi, with NSPIN > 2
+  ///
+  /// Input Parameters
+  /// \param dev : the type of computing device
+  /// \param l1 : ucell->atoms[it].na
+  /// \param l2 : nbands
+  /// \param l3 : ucell->atoms[it].ncpp.nh
+  /// \param sum : intermediate value
+  /// \param iat : intermediate value
+  /// \param nkb : ppcell->nkb, number of kpoints
+  /// \param deeq_x : second dimension of deeq
+  /// \param deeq_y : third dimension of deeq
+  /// \param deeq_z : fourth dimension of deeq
+  /// \param deeq_nc : ppcell->deeq_nc
+  /// \param becp : intermediate array
+  ///
+  /// Output Parameters
+  /// \param ps : output array
   void operator() (
       const Device* dev,
       const int& l1,
 
@@ -7,12 +7,42 @@
 namespace hamilt {
 template <typename FPTYPE, typename Device>
 struct veff_pw_op {
+    /// @brief Compute the effective potential of hPsi in real space,
+    /// out[ir] *= in[ir];
+    ///
+    /// Input Parameters
+    /// \param dev : the type of computing device
+    /// \param size : array size
+    /// \param in : input array, elecstate::Potential::v_effective
+    ///
+    /// Output Parameters
+    /// \param out : output array
     void operator() (
         const Device* dev,
         const int& size,
         std::complex<FPTYPE>* out,
         const FPTYPE* in);
-    
+
+    /// @brief Compute the effective potential of hPsi in real space with NSPIN > 2,
+    ///
+    /// out[ir] = out[ir] * (in[0][ir] + in[3][ir])
+    ///       + out1[ir]
+    ///               * (in[1][ir]
+    ///               - std::complex<FPTYPE>(0.0, 1.0) * in[2][ir]);
+    ///
+    /// out1[ir] = out1[ir] * (in[0][ir] - in[3][ir])
+    ///         + out[ir]
+    ///             * (in[1][ir]
+    ///                 + std::complex<FPTYPE>(0.0, 1.0) * in[2][ir]);
+    ///
+    /// Input Parameters
+    /// \param dev : the type of computing device
+    /// \param size : array size
+    /// \param in : input array, elecstate::Potential::v_effective
+    ///
+    /// Output Parameters
+    /// \param out : output array 1
+    /// \param out1 : output array 2
     void operator() (
         const Device* dev,
         const int& size,
 
@@ -57,6 +57,18 @@ namespace hsolver
 
 template <typename FPTYPE, typename Device> 
 struct zdot_real_op {
+  /// @brief zdot_real_op computes the dot product of the given complex arrays (treated as float arrays).
+  /// MPI communication may be performed when the planewave parallelization strategy is enabled.
+  ///
+  /// Input Parameters
+  /// \param d : the type of computing device
+  /// \param dim : array size
+  /// \param psi_L : input array A
+  /// \param psi_R : input array B
+  /// \param reduce : flag to control whether to perform the MPI communications
+  ///
+  /// \return
+  /// FPTYPE : dot product result
   FPTYPE operator() (
       const Device* d,
       const int& dim,
@@ -68,6 +80,16 @@ struct zdot_real_op {
 // vector operator: result[i] = vector[i] / constant
 template <typename FPTYPE, typename Device> struct vector_div_constant_op
 {
+    /// @brief result[i] = vector[i] / constant
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param dim : array size
+    /// \param vector : input array
+    /// \param constant : input constant
+    ///
+    /// Output Parameters
+    /// \param result : output array
     void operator()(const Device* d,
                     const int dim,
                     std::complex<FPTYPE>* result,
@@ -78,6 +100,17 @@ template <typename FPTYPE, typename Device> struct vector_div_constant_op
 // replace vector_div_constant_op : x = alpha * x
 template <typename FPTYPE, typename Device> struct scal_op
 {
+    /// @brief x = alpha * x
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param N : array size
+    /// \param alpha : input constant
+    /// \param X : input array
+    /// \param incx : computing stride of array X
+    ///
+    /// Output Parameters
+    /// \param X : output array
     void operator()(const Device* d,
                     const int& N,
                     const std::complex<FPTYPE>* alpha,
@@ -88,6 +121,16 @@ template <typename FPTYPE, typename Device> struct scal_op
 // vector operator: result[i] = vector1[i](complex) * vector2[i](not complex)
 template <typename FPTYPE, typename Device> struct vector_mul_vector_op
 {
+    /// @brief result[i] = vector1[i](complex) * vector2[i](not complex)
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param dim : array size
+    /// \param vector1 : input array A
+    /// \param vector2 : input array B
+    ///
+    /// Output Parameters
+    /// \param result : output array
     void operator()(const Device* d,
                     const int& dim,
                     std::complex<FPTYPE>* result,
@@ -98,6 +141,16 @@ template <typename FPTYPE, typename Device> struct vector_mul_vector_op
 // vector operator: result[i] = vector1[i](complex) / vector2[i](not complex)
 template <typename FPTYPE, typename Device> struct vector_div_vector_op
 {
+    /// @brief result[i] = vector1[i](complex) / vector2[i](not complex)
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param dim : array size
+    /// \param vector1 : input array A
+    /// \param vector2 : input array B
+    ///
+    /// Output Parameters
+    /// \param result : output array
     void operator()(const Device* d,
                     const int& dim,
                     std::complex<FPTYPE>* result,
@@ -108,6 +161,18 @@ template <typename FPTYPE, typename Device> struct vector_div_vector_op
 // vector operator: result[i] = vector1[i] * constant1 + vector2[i] * constant2
 template <typename FPTYPE, typename Device> struct constantvector_addORsub_constantVector_op
 {
+    /// @brief result[i] = vector1[i] * constant1 + vector2[i] * constant2
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param dim : array size
+    /// \param vector1 : input array A
+    /// \param constant1 : input constant a
+    /// \param vector2 : input array B
+    /// \param constant2 : input constant b
+    ///
+    /// Output Parameters
+    /// \param result : output array
     void operator()(const Device* d,
                     const int& dim,
                     std::complex<FPTYPE>* result,
@@ -120,6 +185,19 @@ template <typename FPTYPE, typename Device> struct constantvector_addORsub_const
 //  compute Y = alpha * X + Y
 template <typename FPTYPE, typename Device> struct axpy_op
 {
+    /// @brief Y = alpha * X + Y
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param N : array size
+    /// \param alpha : input constant alpha
+    /// \param X : input array X
+    /// \param incX : computing stride of X
+    /// \param Y : input array Y
+    /// \param incY : computing stride of Y
+    ///
+    /// Output Parameters
+    /// \param Y : output array Y
     void operator()(const Device* d,
                     const int& N,
                     const std::complex<FPTYPE>* alpha,
@@ -132,6 +210,24 @@ template <typename FPTYPE, typename Device> struct axpy_op
 // compute y = alpha * op(A) * x + beta * y
 template <typename FPTYPE, typename Device> struct gemv_op
 {
+    /// @brief y = alpha * op(A) * x + beta * y
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param trans : whether to transpose A
+    /// \param m : first dimension of matrix
+    /// \param n : second dimension of matrix
+    /// \param alpha : input constant alpha
+    /// \param A : input matrix A
+    /// \param lda : leading dimension of A
+    /// \param X : input array X
+    /// \param incx : computing stride of X
+    /// \param beta : input constant beta
+    /// \param Y : input array Y
+    /// \param incy : computing stride of Y
+    ///
+    /// Output Parameters
+    /// \param Y : output array Y
     void operator()(const Device* d,
                     const char& trans,
                     const int& m,
@@ -150,6 +246,26 @@ template <typename FPTYPE, typename Device> struct gemv_op
 // compute C = alpha * op(A) * op(B) + beta * C
 template <typename FPTYPE, typename Device> struct gemm_op
 {
+    /// @brief C = alpha * op(A) * op(B) + beta * C
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param transa : whether to transpose matrix A
+    /// \param transb : whether to transpose matrix B
+    /// \param m : first dimension of matrix multiplication
+    /// \param n : second dimension of matrix multiplication
+    /// \param k : third dimension of matrix multiplication
+    /// \param alpha : input constant alpha
+    /// \param a : input matrix A
+    /// \param lda : leading dimension of A
+    /// \param b : input matrix B
+    /// \param ldb : leading dimension of B
+    /// \param beta : input constant beta
+    /// \param c : input matrix C
+    /// \param ldc : leading dimension of C
+    ///
+    /// Output Parameters
+    /// \param c : output matrix C
     void operator()(const Device* d,
                     const char& transa, 
                     const char& transb, 
@@ -168,6 +284,16 @@ template <typename FPTYPE, typename Device> struct gemm_op
 
 template <typename FPTYPE, typename Device> struct matrixTranspose_op
 {
+    /// @brief transpose the input matrix
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param row : first dimension of matrix
+    /// \param col : second dimension of matrix
+    /// \param input_matrix : input matrix
+    ///
+    /// Output Parameters
+    /// \param output_matrix : output matrix
     void operator()(const Device* d,
                     const int& row,
                     const int& col,
@@ -177,6 +303,17 @@ template <typename FPTYPE, typename Device> struct matrixTranspose_op
 
 template <typename FPTYPE, typename Device> struct matrixSetToAnother
 {
+    /// @brief initialize matrix B with A
+    ///
+    /// Input Parameters
+    /// \param d : the type of computing device
+    /// \param n : first dimension of matrix
+    /// \param A : input matrix A
+    /// \param LDA : leading dimension of A
+    /// \param LDB : leading dimension of B
+    ///
+    /// Output Parameters
+    /// \param B : output matrix B
     void operator()(const Device* d,
                     const int& n,
                     const std::complex<FPTYPE>* A,