@@ -133,6 +133,14 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const
133133 }
134134}
135135
// Scale each column of an m x n matrix by a per-column factor:
//   c(row, col) = alpha * b[col] * a(row, col)
// Matrices use column-major indexing (element (row, col) at col * ld + row),
// with leading dimensions lda/ldc (expected >= m). b has length n.
// Expects a 2D launch where x covers rows (m) and y covers columns (n);
// the guard below handles grids that overshoot either dimension.
template <typename T, typename Real>
__global__ void matrix_multiply_vector_kernel(const int m,
                                              const int n,
                                              const T* a,
                                              const int lda,
                                              const Real* b,
                                              const Real alpha,
                                              T* c,
                                              const int ldc)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= n || row >= m)
    {
        return;
    }
    // Adjacent threads in x touch adjacent rows of a column, so global
    // loads/stores are coalesced.
    c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
}
136144cublasOperation_t judge_trans_op (bool is_complex, const char & trans, const char * name)
137145{
138146 if (trans == ' N' )
@@ -147,7 +155,7 @@ cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char*
147155 {
148156 return CUBLAS_OP_C;
149157 }
150- else
158+ else
151159 {
152160 ModuleBase::WARNING_QUIT (name, std::string (" Unknown trans type " ) + trans + std::string (" !" ));
153161 }
@@ -438,10 +446,44 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
438446 cudaCheckOnDebug ();
439447}
440448
// GPU specialization: c = alpha * a * diag(b) for real double matrices,
// i.e. every element of column j is scaled by alpha * b[j].
// a, b, c are device pointers; a is m x n with leading dimension lda,
// c is m x n with leading dimension ldc, b has length n.
template <>
void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int& m, const int& n,
    double* a, const int& lda, const double* b, const double alpha, double* c, const int& ldc)
{
    // Empty matrix: nothing to do. Also avoids launching with a zero-sized
    // grid, which is an invalid CUDA launch configuration.
    if (m <= 0 || n <= 0)
    {
        return;
    }
    const dim3 thread(16, 16, 1);
    // Ceil-division so the grid covers m x n even when not a multiple of 16.
    const dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
    matrix_multiply_vector_kernel<double, double><<<block, thread>>>(m, n, a, lda, b, alpha, c, ldc);
    cudaCheckOnDebug();
}
458+
// GPU specialization: c = alpha * a * diag(b) for single-precision complex
// matrices with a real scaling vector b and real alpha.
// std::complex<float> device pointers are reinterpreted as thrust::complex<float>,
// which is layout-compatible, so the kernel can do complex arithmetic on device.
template <>
void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& m, const int& n,
    std::complex<float>* a, const int& lda, const float* b, const float alpha, std::complex<float>* c, const int& ldc)
{
    // Empty matrix: nothing to do. Also avoids launching with a zero-sized
    // grid, which is an invalid CUDA launch configuration.
    if (m <= 0 || n <= 0)
    {
        return;
    }
    const dim3 thread(16, 16, 1);
    // Ceil-division so the grid covers m x n even when not a multiple of 16.
    const dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
    matrix_multiply_vector_kernel<thrust::complex<float>, float><<<block, thread>>>(
        m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
        b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
    cudaCheckOnDebug();
}
468+
// GPU specialization: c = alpha * a * diag(b) for double-precision complex
// matrices with a real scaling vector b and real alpha.
// std::complex<double> device pointers are reinterpreted as thrust::complex<double>,
// which is layout-compatible, so the kernel can do complex arithmetic on device.
template <>
void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& m, const int& n,
    std::complex<double>* a, const int& lda, const double* b, const double alpha, std::complex<double>* c, const int& ldc)
{
    // Empty matrix: nothing to do. Also avoids launching with a zero-sized
    // grid, which is an invalid CUDA launch configuration.
    if (m <= 0 || n <= 0)
    {
        return;
    }
    const dim3 thread(16, 16, 1);
    // Ceil-division so the grid covers m x n even when not a multiple of 16.
    const dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
    matrix_multiply_vector_kernel<thrust::complex<double>, double><<<block, thread>>>(
        m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
        b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
    cudaCheckOnDebug();
}
441479
// Explicit instantiations of the functors registered for the GPU device,
// so their definitions in this translation unit are emitted for linkers.
template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
template struct matrixCopy<double, base_device::DEVICE_GPU>;
template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;

template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
template struct matrix_mul_vector_op<double, base_device::DEVICE_GPU>;
template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
447489} // namespace ModuleBase