deepmodeling
diff --git a/‎python/pyabacus/CONTRIBUTING.md‎
Lines changed: 6 additions & 3 deletions b/‎python/pyabacus/CONTRIBUTING.md‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎python/pyabacus/src/ModuleBase/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎python/pyabacus/src/ModuleBase/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/pyabacus/src/ModuleNAO/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎python/pyabacus/src/ModuleNAO/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/pyabacus/src/hsolver/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎python/pyabacus/src/hsolver/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/pyabacus/src/hsolver/py_hsolver.cpp‎
Lines changed: 1 addition & 1 deletion b/‎python/pyabacus/src/hsolver/py_hsolver.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎source/CMakeLists.txt‎
Lines changed: 3 additions & 3 deletions b/‎source/CMakeLists.txt‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎source/Makefile.Objects‎
Lines changed: 2 additions & 1 deletion b/‎source/Makefile.Objects‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎source/module_base/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎source/module_base/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎source/module_base/blas_connector.cpp‎
Lines changed: 7 additions & 7 deletions b/‎source/module_base/blas_connector.cpp‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎source/module_hsolver/kernels/cuda/math_kernel_op.cu‎ renamed to ‎source/module_base/kernels/cuda/math_kernel_op.cu‎
Lines changed: 25 additions & 4 deletions b/‎source/module_hsolver/kernels/cuda/math_kernel_op.cu‎ renamed to ‎source/module_base/kernels/cuda/math_kernel_op.cu‎
Lines changed: 25 additions & 4 deletions
@@ -8,10 +8,13 @@ Welcome to the `pyabacus` project! This document provides guidelines and instruc
 
 <!-- toc -->
 
-- [Project structure](#project-structure)
+- [Developer Guide](#developer-guide)
+  - [Introduction](#introduction)
+  - [Project Structure](#project-structure)
     - [Root CMake Configuration](#root-cmake-configuration)
     - [Module CMake Configuration](#module-cmake-configuration)
-- [Development Process](#development-process)
+  - [Development Process](#development-process)
+  - [Conclusion](#conclusion)
 
 <!-- tocstop -->
 
@@ -187,7 +190,7 @@ list(APPEND _diago
     ${HSOLVER_PATH}/diag_const_nums.cpp
     ${HSOLVER_PATH}/diago_iter_assist.cpp
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
-    ${HSOLVER_PATH}/kernels/math_kernel_op.cpp
+    ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/kernels/math_op.cpp
     ${BASE_PATH}/module_device/device.cpp
     ${BASE_PATH}/module_device/memory_op.cpp
 
@@ -1,6 +1,7 @@
 list(APPEND pymodule_base
     ${PROJECT_SOURCE_DIR}/src/ModuleBase/py_base_math.cpp
     ${BASE_PATH}/kernels/math_op.cpp
+    ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/module_device/memory_op.cpp
     ${BASE_PATH}/module_device/device.cpp
     )
 
@@ -14,6 +14,7 @@ list(APPEND _naos
     ${NAO_PATH}/two_center_table.cpp
     # dependency
     ${ABACUS_SOURCE_DIR}/module_base/kernels/math_op.cpp
+    ${ABACUS_SOURCE_DIR}/module_base/kernels/math_kernel_op.cpp
     # ${ABACUS_SOURCE_DIR}/module_psi/kernels/psi_memory_op.cpp
     ${ABACUS_SOURCE_DIR}/module_base/module_device/memory_op.cpp
     ${ABACUS_SOURCE_DIR}/module_base/module_device/device.cpp
 
@@ -10,8 +10,8 @@ list(APPEND _diago
 
 
     ${HSOLVER_PATH}/kernels/dngvd_op.cpp
-    ${HSOLVER_PATH}/kernels/math_kernel_op.cpp
     # dependency
+    ${BASE_PATH}/kernels/math_kernel_op.cpp
     ${BASE_PATH}/kernels/math_op.cpp
     ${BASE_PATH}/module_device/device.cpp
     ${BASE_PATH}/module_device/memory_op.cpp
 
@@ -6,7 +6,7 @@
 #include <pybind11/numpy.h>
 
 #include "module_hsolver/diago_dav_subspace.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_base/module_device/types.h"
 
 #include "./py_diago_dav_subspace.hpp"
 
@@ -36,14 +36,14 @@ list(APPEND device_srcs
   module_hamilt_pw/hamilt_stodft/kernels/hpsi_norm_op.cpp
   module_basis/module_pw/kernels/pw_op.cpp
   module_hsolver/kernels/dngvd_op.cpp
-  module_hsolver/kernels/math_kernel_op.cpp
   module_elecstate/kernels/elecstate_op.cpp
 
   # module_psi/kernels/psi_memory_op.cpp
   # module_psi/kernels/device.cpp
 
   module_base/module_device/device.cpp
   module_base/module_device/memory_op.cpp
+  module_base/kernels/math_kernel_op.cpp
 
   module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
   module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -64,7 +64,6 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu
     module_basis/module_pw/kernels/cuda/pw_op.cu
     module_hsolver/kernels/cuda/dngvd_op.cu
-    module_hsolver/kernels/cuda/math_kernel_op.cu
     module_elecstate/kernels/cuda/elecstate_op.cu
 
     # module_psi/kernels/cuda/memory_op.cu
@@ -75,6 +74,7 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/wf_op.cu
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/vnl_op.cu
     module_base/kernels/cuda/math_op.cu
+    module_base/kernels/cuda/math_kernel_op.cu
     module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu
   )
 endif()
@@ -89,7 +89,6 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu
     module_basis/module_pw/kernels/rocm/pw_op.hip.cu
     module_hsolver/kernels/rocm/dngvd_op.hip.cu
-    module_hsolver/kernels/rocm/math_kernel_op.hip.cu
     module_elecstate/kernels/rocm/elecstate_op.hip.cu
 
     # module_psi/kernels/rocm/memory_op.hip.cu
@@ -99,6 +98,7 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/wf_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/vnl_op.hip.cu
+    module_base/kernels/rocm/math_kernel_op.hip.cu
     module_base/kernels/rocm/math_op.hip.cu
     module_hamilt_general/module_xc/kernels/rocm/xc_functional_op.hip.cu
   )
 
@@ -146,11 +146,13 @@ OBJS_BASE=abfs-vector3_order.o\
     math_bspline.o\
     math_chebyshev.o\
     math_op.o\
+    math_kernel_op.o\
     mathzone_add1.o\
     matrix.o\
     matrix3.o\
     memory.o\
     mymath.o\
+    para_gemm.o\
     realarray.o\
     sph_bessel_recursive-d1.o\
     sph_bessel_recursive-d2.o\
@@ -336,7 +338,6 @@ OBJS_HSOLVER=diago_cg.o\
     hsolver_lcaopw.o\
     hsolver_pw_sdft.o\
     diago_iter_assist.o\
-    math_kernel_op.o\
     dngvd_op.o\
     diag_const_nums.o\
     diag_hs_para.o\
 
@@ -37,6 +37,7 @@ add_library(
     mymath.cpp
     opt_CG.cpp
     opt_DCsrch.cpp
+    para_gemm.cpp
     realarray.cpp
     sph_bessel_recursive-d1.cpp
     sph_bessel_recursive-d2.cpp
 
@@ -10,7 +10,7 @@
 #include <base/macros/macros.h>
 #include <cuda_runtime.h>
 #include "cublas_v2.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_base/module_device/memory_op.h"
 
 
@@ -668,7 +668,7 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec
 	}
 	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
 #ifdef __CUDA
-		hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+		ModuleBase::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
 #endif
 	}
 }
@@ -688,7 +688,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec
 	}
 	else if (device_type == base_device::AbacusDevice_t::GpuDevice){
 #ifdef __CUDA
-		hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
+		ModuleBase::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);
 #endif
 	}
 }
@@ -706,7 +706,7 @@ void vector_add_vector(const int& dim, float *result, const float *vector1, cons
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -724,7 +724,7 @@ void vector_add_vector(const int& dim, double *result, const double *vector1, co
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -742,7 +742,7 @@ void vector_add_vector(const int& dim, std::complex<float> *result, const std::c
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -760,7 +760,7 @@ void vector_add_vector(const int& dim, std::complex<double> *result, const std::
 	}
 	else if (device_type == base_device::GpuDevice){
 #ifdef __CUDA
-		hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
+		ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);
 #endif
 	}
 }
@@ -1,5 +1,5 @@
 #include "module_base/module_device/memory_op.h"
-#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
@@ -9,7 +9,7 @@
 #include <thrust/execution_policy.h>
 #include <thrust/inner_product.h>
 
-namespace hsolver
+namespace ModuleBase
 {
 const int warp_size = 32;
 // const unsigned int full_mask = 0xffffffff;
@@ -24,7 +24,7 @@ template <>
 struct GetTypeReal<thrust::complex<double>> {
     using type = double; /**< The return type specialization for std::complex<double>. */
 };
-namespace hsolver {
+namespace ModuleBase {
 template <typename T>
 struct GetTypeThrust {
     using type = T;
@@ -817,6 +817,27 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEV
     cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx));
 }
 
+template <>
+void gemm_op<float, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
+                                                         const char& transa,
+                                                         const char& transb,
+                                                         const int& m,
+                                                         const int& n,
+                                                         const int& k,
+                                                         const float* alpha,
+                                                         const float* a,
+                                                         const int& lda,
+                                                         const float* b,
+                                                         const int& ldb,
+                                                         const float* beta,
+                                                         float* c,
+                                                         const int& ldc)
+{
+    cublasOperation_t cutransA = judge_trans_op(false, transa, "gemm_op");
+    cublasOperation_t cutransB = judge_trans_op(false, transb, "gemm_op");
+    cublasErrcheck(cublasSgemm(cublas_handle, cutransA, cutransB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
 template <>
 void gemm_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
                                                           const char& transa,
@@ -1060,4 +1081,4 @@ template struct vector_div_vector_op<double, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<double, base_device::DEVICE_GPU>;
 template struct constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>;
 #endif
-}  // namespace hsolver
+}  // namespace ModuleBase
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`list(APPEND pymodule_base`
`2`	`2`	`${PROJECT_SOURCE_DIR}/src/ModuleBase/py_base_math.cpp`
`3`	`3`	`${BASE_PATH}/kernels/math_op.cpp`
	`4`	`+ ${BASE_PATH}/kernels/math_kernel_op.cpp`
`4`	`5`	`${BASE_PATH}/module_device/memory_op.cpp`
`5`	`6`	`${BASE_PATH}/module_device/device.cpp`
`6`	`7`	`)`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@`
`10`	`10`	`#include <base/macros/macros.h>`
`11`	`11`	`#include <cuda_runtime.h>`
`12`	`12`	`#include "cublas_v2.h"`
`13`		`-#include "module_hsolver/kernels/math_kernel_op.h"`
	`13`	`+#include "module_base/kernels/math_kernel_op.h"`
`14`	`14`	`#include "module_base/module_device/memory_op.h"`
`15`	`15`
`16`	`16`
`@@ -668,7 +668,7 @@ void vector_mul_vector(const int& dim, T* result, const T* vector1, const T* vec`
`668`	`668`	`}`
`669`	`669`	`else if (device_type == base_device::AbacusDevice_t::GpuDevice){`
`670`	`670`	`#ifdef __CUDA`
`671`		`- hsolver::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);`
	`671`	`+ ModuleBase::vector_mul_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);`
`672`	`672`	`#endif`
`673`	`673`	`}`
`674`	`674`	`}`
`@@ -688,7 +688,7 @@ void vector_div_vector(const int& dim, T* result, const T* vector1, const T* vec`
`688`	`688`	`}`
`689`	`689`	`else if (device_type == base_device::AbacusDevice_t::GpuDevice){`
`690`	`690`	`#ifdef __CUDA`
`691`		`- hsolver::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);`
	`691`	`+ ModuleBase::vector_div_vector_op<T, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, vector2);`
`692`	`692`	`#endif`
`693`	`693`	`}`
`694`	`694`	`}`
`@@ -706,7 +706,7 @@ void vector_add_vector(const int& dim, float result, const float vector1, cons`
`706`	`706`	`}`
`707`	`707`	`else if (device_type == base_device::GpuDevice){`
`708`	`708`	`#ifdef __CUDA`
`709`		`- hsolver::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
	`709`	`+ ModuleBase::constantvector_addORsub_constantVector_op<float, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
`710`	`710`	`#endif`
`711`	`711`	`}`
`712`	`712`	`}`
`@@ -724,7 +724,7 @@ void vector_add_vector(const int& dim, double result, const double vector1, co`
`724`	`724`	`}`
`725`	`725`	`else if (device_type == base_device::GpuDevice){`
`726`	`726`	`#ifdef __CUDA`
`727`		`- hsolver::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
	`727`	`+ ModuleBase::constantvector_addORsub_constantVector_op<double, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
`728`	`728`	`#endif`
`729`	`729`	`}`
`730`	`730`	`}`
`@@ -742,7 +742,7 @@ void vector_add_vector(const int& dim, std::complex<float> *result, const std::c`
`742`	`742`	`}`
`743`	`743`	`else if (device_type == base_device::GpuDevice){`
`744`	`744`	`#ifdef __CUDA`
`745`		`- hsolver::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
	`745`	`+ ModuleBase::constantvector_addORsub_constantVector_op<std::complex<float>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
`746`	`746`	`#endif`
`747`	`747`	`}`
`748`	`748`	`}`
`@@ -760,7 +760,7 @@ void vector_add_vector(const int& dim, std::complex<double> *result, const std::`
`760`	`760`	`}`
`761`	`761`	`else if (device_type == base_device::GpuDevice){`
`762`	`762`	`#ifdef __CUDA`
`763`		`- hsolver::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
	`763`	`+ ModuleBase::constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, dim, result, vector1, constant1, vector2, constant2);`
`764`	`764`	`#endif`
`765`	`765`	`}`
`766`	`766`	`}`