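Fix compile: correct the math_kernel_op.h include path, use ModuleBase:: for the GPU BLAS handle and vector ops, add the float GPU gemm_op specialization, and guard MPI-only code in PGemmCN::multiply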

Qianruipku · Qianruipku · commit 57cecaeb4299 · 2025-01-19T17:41:16.000+08:00
diff --git a/source/module_base/kernels/cuda/math_kernel_op.cu b/source/module_base/kernels/cuda/math_kernel_op.cu
@@ -1,5 +1,5 @@
 #include "module_base/module_device/memory_op.h"
-#include "module_ModuleBase/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
@@ -817,6 +817,27 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEV
     cublasErrcheck(cublasZscal(cublas_handle, N, (double2*)alpha, (double2*)X, incx));
 }
 
+template <> // explicit specialization of the gemm_op functor for float on DEVICE_GPU
+void gemm_op<float, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d, // d is not referenced in the body
+                                                         const char& transa, // translated to a cublasOperation_t below
+                                                         const char& transb, // translated to a cublasOperation_t below
+                                                         const int& m,
+                                                         const int& n,
+                                                         const int& k,
+                                                         const float* alpha, // forwarded as a pointer; presumably must match the handle's pointer mode -- TODO confirm
+                                                         const float* a,
+                                                         const int& lda,
+                                                         const float* b,
+                                                         const int& ldb,
+                                                         const float* beta, // forwarded as a pointer, same caveat as alpha
+                                                         float* c, // output matrix handed to cublasSgemm
+                                                         const int& ldc)
+{ // Forwards all arguments unchanged to cublasSgemm via cublas_handle, which is declared outside this hunk.
+    cublasOperation_t cutransA = judge_trans_op(false, transa, "gemm_op"); // leading `false` presumably marks a non-complex type -- TODO confirm
+    cublasOperation_t cutransB = judge_trans_op(false, transb, "gemm_op"); // "gemm_op" is passed as a label string
+    cublasErrcheck(cublasSgemm(cublas_handle, cutransA, cutransB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
 template <>
 void gemm_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
                                                           const char& transa,
diff --git a/source/module_base/kernels/rocm/math_kernel_op.hip.cu b/source/module_base/kernels/rocm/math_kernel_op.hip.cu
@@ -1,5 +1,5 @@
 #include "module_base/module_device/memory_op.h"
-#include "module_ModuleBase/kernels/math_kernel_op.h"
+#include "module_base/kernels/math_kernel_op.h"
 #include "module_psi/psi.h"
 #include "module_base/tool_quit.h"
 
@@ -735,6 +735,27 @@ void scal_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEV
     hipblasErrcheck(hipblasZscal(cublas_handle, N, (hipblasDoubleComplex*)alpha, (hipblasDoubleComplex*)X, incx));
 }
 
+template <> // explicit specialization of the gemm_op functor for float on DEVICE_GPU (HIP build)
+void gemm_op<float, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d, // d is not referenced in the body
+                                                         const char& transa, // translated to a hipblasOperation_t below
+                                                         const char& transb, // translated to a hipblasOperation_t below
+                                                         const int& m,
+                                                         const int& n,
+                                                         const int& k,
+                                                         const float* alpha, // forwarded as a pointer; presumably must match the handle's pointer mode -- TODO confirm
+                                                         const float* a,
+                                                         const int& lda,
+                                                         const float* b,
+                                                         const int& ldb,
+                                                         const float* beta, // forwarded as a pointer, same caveat as alpha
+                                                         float* c, // output matrix handed to hipblasSgemm
+                                                         const int& ldc)
+{ // Forwards all arguments unchanged to hipblasSgemm via cublas_handle (the name is kept from the CUDA file; declared outside this hunk).
+    hipblasOperation_t cutransA = judge_trans_op(false, transa, "gemm_op"); // leading `false` presumably marks a non-complex type -- TODO confirm
+    hipblasOperation_t cutransB = judge_trans_op(false, transb, "gemm_op"); // "gemm_op" is passed as a label string
+    hipblasErrcheck(hipblasSgemm(cublas_handle, cutransA, cutransB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
 template <>
 void gemm_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* d,
                                                           const char& transa,
diff --git a/source/module_base/kernels/test/math_kernel_test.cpp b/source/module_base/kernels/test/math_kernel_test.cpp
@@ -375,9 +375,9 @@ TEST_F(TestModuleHsolverMathKernel, zdot_real_op_gpu)
     resize_memory_op()(psi_R_dev, psi_R.size());
     synchronize_memory_op()(psi_L_dev, psi_L.data(), psi_L.size());
     synchronize_memory_op()(psi_R_dev, psi_R.data(), psi_R.size());
-    hsolver::createGpuBlasHandle();
+    ModuleBase::createGpuBlasHandle();
     double result = zdot_real_gpu_op()(gpu_ctx, dim, psi_L_dev, psi_R_dev, false);
-    hsolver::destoryBLAShandle();
+    ModuleBase::destoryBLAShandle();
     EXPECT_LT(fabs(result - expected_result), 1e-12);
     delete_memory_op()(psi_L_dev);
     delete_memory_op()(psi_R_dev);
@@ -537,9 +537,9 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu)
     synchronize_memory_op()(Y_axpy_dev, Y_axpy.data(), Y_axpy.size());
 
     // run
-    hsolver::createGpuBlasHandle();
+    ModuleBase::createGpuBlasHandle();
     axpy_op_gpu()(gpu_ctx, dim, &alpha_axpy, X_axpy_dev, 1, Y_axpy_dev, 1);
-    hsolver::destoryBLAShandle();
+    ModuleBase::destoryBLAShandle();
 
     // syn the output data in GPU to CPU
     synchronize_memory_op_gpu()(Y_axpy.data(), Y_axpy_dev, Y_axpy.size());
@@ -566,9 +566,9 @@ TEST_F(TestModuleHsolverMathKernel, scal_op_gpu)
     synchronize_memory_op()(X_scal_dev, X_scal.data(), X_scal.size());
 
     // run
-    hsolver::createGpuBlasHandle();
+    ModuleBase::createGpuBlasHandle();
     scal_op_gpu()(gpu_ctx, dim, &alpha_scal, X_scal_dev, 1);
-    hsolver::destoryBLAShandle();
+    ModuleBase::destoryBLAShandle();
 
     // syn the output data in GPU to CPU
     synchronize_memory_op_gpu()(X_scal.data(), X_scal_dev, X_scal.size());
@@ -599,9 +599,9 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu)
     synchronize_memory_op()(Y_gemv_dev, Y_gemv.data(), Y_gemv.size());
 
     // run
-    hsolver::createGpuBlasHandle();
+    ModuleBase::createGpuBlasHandle();
     gemv_op_gpu()(gpu_ctx, 'C', 2, 3, &ModuleBase::ONE, A_gemv_dev, 2, X_gemv_dev, 1, &ModuleBase::ONE, Y_gemv_dev, 1);
-    hsolver::destoryBLAShandle();
+    ModuleBase::destoryBLAShandle();
     // syn the output data in GPU to CPU
     synchronize_memory_op_gpu()(Y_gemv.data(), Y_gemv_dev, Y_gemv.size());
 
diff --git a/source/module_base/para_gemm.cpp b/source/module_base/para_gemm.cpp
@@ -189,9 +189,11 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
         }
     }
     else
-#endif
     {
         T real_beta = row_rank == 0 ? beta : 0;
+#else
+        T real_beta = beta;
+#endif
         ModuleBase::gemm_op<T, Device>()(ctx,
                                          'C',
                                          'N',
@@ -206,8 +208,10 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T
                                          &real_beta,
                                          C_global,
                                          LDC_global);
+#ifdef __MPI
         Parallel_Common::reduce_dev<T, Device>(C_global, size_C_global, row_world);
     }
+#endif
 }
 
 template class PGemmCN<double, base_device::DEVICE_CPU>;
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
@@ -73,7 +73,7 @@ ESolver_KS_PW<T, Device>::ESolver_KS_PW()
 #if ((defined __CUDA) || (defined __ROCM))
     if (this->device == base_device::GpuDevice)
     {
-        hsolver::createGpuBlasHandle();
+        ModuleBase::createGpuBlasHandle();
         hsolver::createGpuSolverHandle();
         container::kernels::createGpuBlasHandle();
         container::kernels::createGpuSolverHandle();
@@ -101,7 +101,7 @@ ESolver_KS_PW<T, Device>::~ESolver_KS_PW()
     if (this->device == base_device::GpuDevice)
     {
 #if defined(__CUDA) || defined(__ROCM)
-        hsolver::destoryBLAShandle();
+        ModuleBase::destoryBLAShandle();
         hsolver::destroyGpuSolverHandle();
         container::kernels::destroyGpuBlasHandle();
         container::kernels::destroyGpuSolverHandle();
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
@@ -354,11 +354,11 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
         if (this->device == base_device::GpuDevice)
         {
             syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim);
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              this->d_precondition);
+            ModuleBase::vector_div_vector_op<T, Device>()(this->ctx,
+                                                          this->dim,
+                                                          psi_iter + (nbase + m) * this->dim,
+                                                          psi_iter + (nbase + m) * this->dim,
+                                                          this->d_precondition);
         }
         else
 #endif
diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp
@@ -416,11 +416,11 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
             Real* e_temp_gpu = nullptr;
             resmem_var_op()(e_temp_gpu, nbase);
             syncmem_var_h2d_op()(e_temp_gpu, e_temp_cpu.data(), nbase);
-            vector_mul_vector_op<T, Device>()(this->ctx,
-                                                   nbase,
-                                                   vc_ev_vector + m * nbase,
-                                                   vc_ev_vector + m * nbase,
-                                                   e_temp_gpu);
+            ModuleBase::vector_mul_vector_op<T, Device>()(this->ctx,
+                                                          nbase,
+                                                          vc_ev_vector + m * nbase,
+                                                          vc_ev_vector + m * nbase,
+                                                          e_temp_gpu);
             delmem_var_op()(e_temp_gpu);
 #endif
         }
@@ -468,11 +468,11 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
         if (this->device == base_device::GpuDevice)
         {
 #if defined(__CUDA) || defined(__ROCM)
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                                   dim,
-                                                   basis + dim*(nbase + m),
-                                                   basis + dim*(nbase + m),
-                                                   this->d_precondition);
+            ModuleBase::vector_div_vector_op<T, Device>()(this->ctx,
+                                                          dim,
+                                                          basis + dim * (nbase + m),
+                                                          basis + dim * (nbase + m),
+                                                          this->d_precondition);
 #endif
         }
         else
diff --git a/source/module_hsolver/kernels/test/math_dngvd_test.cpp b/source/module_hsolver/kernels/test/math_dngvd_test.cpp
@@ -144,13 +144,13 @@ TEST_F(TestModuleHsolverMathDngvd, transpose_gpu)
     synchronize_memory_op_C2G_Z()(device_transpose, transpose.data(), transpose.size());
 
     // run
-    hsolver::createGpuBlasHandle();
+    ModuleBase::createGpuBlasHandle();
     ModuleBase::matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                                  2,
                                                                                  3,
                                                                                  device_transpose,
                                                                                  device_transpose);
-    hsolver::destoryBLAShandle();
+    ModuleBase::destoryBLAShandle();
 
     // copy transpose data from GPU to CPU
     std::vector<std::complex<double>> transpose_result = {
diff --git a/source/module_hsolver/kernels/test/perf_math_kernel.cpp b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
@@ -114,7 +114,7 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
         resize_memory_op_double()(test_dvector_a_gpu, dim_vector);
         synchronize_memory_op_double()(test_dvector_a_gpu, test_dvector_a, dim_vector);
 
-        hsolver::createGpuBlasHandle();
+        ModuleBase::createGpuBlasHandle();
 
 
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -125,7 +125,7 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
         delete[] result_zvector;
         delete[] test_dvector_a;
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
-        hsolver::destoryBLAShandle();
+        ModuleBase::destoryBLAShandle();
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
     }
 

Original file line number	Diff line number	Diff line change
`@@ -189,9 +189,11 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T`
`189`	`189`	`}`
`190`	`190`	`}`
`191`	`191`	`else`
`192`		`-#endif`
`193`	`192`	`{`
`194`	`193`	`T real_beta = row_rank == 0 ? beta : 0;`
	`194`	`+#else`
	`195`	`+ T real_beta = beta;`
	`196`	`+#endif`
`195`	`197`	`ModuleBase::gemm_op<T, Device>()(ctx,`
`196`	`198`	`'C',`
`197`	`199`	`'N',`
`@@ -206,8 +208,10 @@ void PGemmCN<T, Device>::multiply(const T alpha, const T* A, const T* B, const T`
`206`	`208`	`&real_beta,`
`207`	`209`	`C_global,`
`208`	`210`	`LDC_global);`
	`211`	`+#ifdef __MPI`
`209`	`212`	`Parallel_Common::reduce_dev<T, Device>(C_global, size_C_global, row_world);`
`210`	`213`	`}`
	`214`	`+#endif`
`211`	`215`	`}`
`212`	`216`
`213`	`217`	`template class PGemmCN<double, base_device::DEVICE_CPU>;`