From 5e7cf0deff900d79c179456a509b9436684205e0 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Fri, 17 Oct 2025 09:47:48 +0800 Subject: [PATCH 01/17] Update lapack_hegvd interface to support lda different than n --- .../ATen/kernels/cuda/lapack.cu | 31 ++- .../module_container/ATen/kernels/lapack.cpp | 118 +++++++- .../module_container/ATen/kernels/lapack.h | 66 ++++- .../ATen/kernels/rocm/lapack.hip.cu | 23 +- .../ATen/kernels/test/lapack_test.cpp | 34 +-- .../base/third_party/cusolver.h | 260 ++++++++++++++++-- source/source_hsolver/kernels/hegvd_op.cpp | 5 +- 7 files changed, 452 insertions(+), 85 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu index fcdaa7e70a..33e1eef7f5 100644 --- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu @@ -105,18 +105,37 @@ template struct lapack_hegvd { using Real = typename GetTypeReal::type; void operator()( - const int& itype, - const char& jobz, - const char& uplo, + const int dim, + const int lda, T* Mat_A, T* Mat_B, - const int& dim, - Real* eigen_val) + Real* eigen_val, + T *eigen_vec) { - cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val); + const int itype = 1; + const char jobz = 'V'; + const char uplo = 'U'; + cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice)); + cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val); } }; +// template +// struct lapack_hegvx { +// using Real = typename GetTypeReal::type; +// void operator()( +// const int n, +// const int lda, +// T *A, +// T *B, +// const int m, +// Real *eigen_val, +// T *eigen_vec) +// { +// cuSolverConnector::hegvx(cusolver_handle, n, lda, A, B, m, eigen_val, eigen_vec); +// } +// }; +// template struct lapack_getrf { void operator()( diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp index 21cbc46f96..90a3c5a0b6 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.cpp +++ b/source/source_base/module_container/ATen/kernels/lapack.cpp @@ -2,6 +2,9 @@ #include +// #include // std::memcpy +#include // std::copy + namespace container { namespace kernels { @@ -36,7 +39,7 @@ struct lapack_trtri { const char& diag, const int& dim, T* Mat, - const int& lda) + const int& lda) { int info = 0; lapackConnector::trtri(uplo, diag, dim, Mat, lda, info); @@ -51,8 +54,8 @@ struct lapack_potrf { void operator()( const char& uplo, const int& dim, - T* Mat, - const int& lda) + T* Mat, + const int& lda) { int info = 0; lapackConnector::potrf(uplo, dim, Mat, dim, info); @@ -85,7 +88,7 @@ struct lapack_heevd { Tensor iwork(DataTypeToEnum::value, DeviceType::CpuDevice, {liwork}); iwork.zero(); - lapackConnector::heevd(jobz, uplo, dim, Mat, dim, eigen_val, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, info); + lapackConnector::heevd(jobz, uplo, dim, Mat, dim, eigen_val, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, info); if (info != 0) { throw std::runtime_error("heevd failed with info = " + std::to_string(info)); } @@ -96,14 +99,26 @@ template struct lapack_hegvd { using Real = typename GetTypeReal::type; void operator()( - const int& itype, - const char& jobz, - const char& uplo, - T* Mat_A, - T* Mat_B, - const int& dim, - Real* eigen_val) + const int dim, + const int lda, + T *Mat_A, + T *Mat_B, + Real *eigen_val, + T *eigen_vec) { + // first copy Mat_A to eigen_vec + // then pass as argument "A" in lapack hegvd + // and this block of memory will be overwritten by eigenvectors + // for (int i = 0; i < dim * lda; ++i){ + // eigen_vec[i] = Mat_A[i]; + // } + // std::memcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda); + // eigen_vec = Mat_A + std::copy(Mat_A, Mat_A + dim*lda, eigen_vec); + + const int itype = 1; + const char jobz = 'V'; + const char uplo = 'U'; int info = 0; int lwork = std::max(2 * dim + dim * dim, 1 + 6 * dim + 2 * dim * dim); Tensor work(DataTypeToEnum::value, DeviceType::CpuDevice, {lwork}); @@ -117,13 +132,90 @@ struct lapack_hegvd { Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {liwork}); iwork.zero(); - lapackConnector::hegvd(itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, info); + // After this, eigen_vec will contain the matrix Z of eigenvectors + lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, info); if (info != 0) { throw std::runtime_error("hegvd failed with info = " + std::to_string(info)); } } }; + +// template +// struct lapack_hegvx { +// using Real = typename GetTypeReal::type; +// void operator()( +// const int n, +// const int lda, +// T *A, +// T *B, +// const int m, +// Real *eigen_val, +// T *eigen_vec) +// { +// int info = 0; + +// int mm = m; + +// int lwork = -1; + +// T *work = new T[1]; +// Real *rwork = new Real[7 * n]; +// int *iwork = new int[5 * n]; +// int *ifail = new int[n]; + +// // set lwork = -1 to query optimal work size +// lapackConnector::hegvx(1, // ITYPE = 1: A*x = (lambda)*B*x +// 'V', 'I', 'U', +// n, A, lda, B, lda, +// 0.0, 0.0, +// 1, m, 0.0, mm, +// eigen_val, eigen_vec, lda, +// work, +// lwork, // lwork = 1, query optimal size. +// rwork, iwork, ifail, +// info); + +// // !> If LWORK = -1, then a workspace query is assumed; the routine +// // !> only calculates the optimal size of the WORK array, returns +// // !> this value as the first entry of the WORK array. +// lwork = int(get_real(work[0])); +// delete[] work; +// work = new T[lwork]; + +// lapackConnector::hegvx( +// 1, // ITYPE = 1: A*x = (lambda)*B*x +// 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. +// 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. +// 'U', // UPLO = 'U': Upper triangles of A and B are stored. +// n, // order of the matrices A and B. +// A, // A is COMPLEX*16 array dimension (LDA, N) +// lda, // leading dimension of the array A. +// B, // B is COMPLEX*16 array, dimension (LDB, N) +// lda, // assume that leading dimension of B is the same as A. +// 0.0, // VL, Not referenced if RANGE = 'A' or 'I'. +// 0.0, // VU, Not referenced if RANGE = 'A' or 'I'. +// 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, +// m, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, +// 0.0, // ABSTOL +// mm, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. +// eigen_val, // W store eigenvalues +// eigen_vec, // Z store eigenvector +// lda, // LDZ: The leading dimension of the array Z. +// work, +// lwork, +// rwork, +// iwork, +// ifail, +// info); + +// delete[] work; +// delete[] rwork; +// delete[] iwork; +// delete[] ifail; +// } +// }; + template struct lapack_getrf { void operator()( @@ -220,4 +312,4 @@ template struct lapack_getrs, DEVICE_CPU>; template struct lapack_getrs, DEVICE_CPU>; } // namespace kernels -} // namespace container \ No newline at end of file +} // namespace container diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index dd3243ac60..5f201769ce 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -35,7 +35,7 @@ struct lapack_potrf { void operator()( const char& uplo, const int& dim, - T* Mat, + T* Mat, const int& lda); }; @@ -55,16 +55,64 @@ struct lapack_heevd { template struct lapack_hegvd { using Real = typename GetTypeReal::type; + /** + * @brief Computes all the eigenvalues and, optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem. + * + * This function solves the problem A*x = lambda*B*x, where A and B are Hermitian matrices, and B is also positive definite. + * + * @param dim The order of the matrices Mat_A and Mat_B. dim >= 0. + * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, dim). + * @param Mat_A On entry, the Hermitian matrix A. On exit, it may be overwritten. + * @param Mat_B On entry, the Hermitian positive definite matrix B. On exit, it may be overwritten. + * @param eigen_val Array to store the computed eigenvalues in ascending order. + * @param eigen_vec If not nullptr, array to store the computed eigenvectors. + * + * @note + * See LAPACK ZHEGVD or CHEGVD documentation for more details. + * This function assumes that A and B have the same leading dimensions, lda. + */ void operator()( - const int& itype, - const char& jobz, - const char& uplo, - T* Mat_A, - T* Mat_B, - const int& dim, - Real* eigen_val); + const int dim, + const int lda, + T *Mat_A, + T *Mat_B, + Real *eigen_val, + T *eigen_vec); }; +// template +// struct lapack_hegvx { +// using Real = typename GetTypeReal::type; +// /** +// * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of +// * a complex generalized Hermitian-definite eigenproblem. +// * +// * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version +// * is implemented through the `evd` interface and acquires the first m eigenpairs +// * +// * hegvx 'V' 'I' 'U' is used to compute the first m eigenpairs of the problem +// * +// * @param n The order of the matrices A and B. n >= 0. +// * @param lda The leading dimension of the array A and B. lda >= max(1, n). +// * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors. +// * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B. +// * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n. +// * @param eigen_val The first m eigenvalues in ascending order. +// * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues. +// * +// * @note +// * See LAPACK ZHEGVX doc for more details. +// */ +// void operator()( +// const int n, +// const int lda, +// T *A, +// T *B, +// const int m, +// Real *eigen_val, +// T *eigen_vec); +// }; + template struct lapack_getrf { @@ -110,4 +158,4 @@ void destroyGpuSolverHandle(); // destroy cusolver handle } // namespace container } // namespace kernels -#endif // ATEN_KERNELS_LAPACK_H_ \ No newline at end of file +#endif // ATEN_KERNELS_LAPACK_H_ diff --git a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu index 64040cb79c..93f0fa481b 100644 --- a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu +++ b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu @@ -28,8 +28,8 @@ void destroyGpuSolverHandle() { template __global__ void set_matrix_kernel( const char uplo, - T* A, - const int dim) + T* A, + const int dim) { int bid = blockIdx.x; int tid = threadIdx.x; @@ -64,7 +64,7 @@ struct lapack_trtri { const char& diag, const int& dim, T* Mat, - const int& lda) + const int& lda) { // TODO: trtri is not implemented in this method yet // Cause the trtri in cuSolver is not stable for ABACUS! @@ -82,8 +82,8 @@ struct lapack_potrf { void operator()( const char& uplo, const int& dim, - T* Mat, - const int& lda) + T* Mat, + const int& lda) { // hipSolverConnector::potrf(hipsolver_handle, uplo, dim, Mat, dim); std::vector H_Mat(dim * dim, static_cast(0.0)); @@ -118,15 +118,22 @@ template struct lapack_hegvd { using Real = typename GetTypeReal::type; void operator()( + const int dim, + const int lda, const int& itype, const char& jobz, const char& uplo, T* Mat_A, T* Mat_B, const int& dim, - Real* eigen_val) + Real* eigen_val, + T *eigen_vec) { - hipSolverConnector::hegvd(hipsolver_handle, itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val); + const int itype = 1; + const char jobz = 'V'; + const char uplo = 'U'; + hipErrcheck(hipMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, hipMemcpyDeviceToDevice)); + hipSolverConnector::hegvd(hipsolver_handle, itype, jobz, uplo, dim, Mat_A, lda, Mat_B, lda, eigen_val); } }; @@ -156,4 +163,4 @@ template struct lapack_hegvd, DEVICE_GPU>; template struct lapack_hegvd, DEVICE_GPU>; } // namespace kernels -} // namespace container \ No newline at end of file +} // namespace container diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp index 9675e979c5..8027d89f82 100644 --- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp +++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp @@ -33,14 +33,14 @@ TYPED_TEST(LapackTest, Trtri) { Tensor A = std::move(Tensor({static_cast(1.0), static_cast(2.0), static_cast(3.0), static_cast(0.0), static_cast(4.0), static_cast(5.0), static_cast(0.0), static_cast(0.0), static_cast(6.0)}).to_device()); - + Tensor I = std::move(Tensor({static_cast(1.0), static_cast(0.0), static_cast(0.0), static_cast(0.0), static_cast(1.0), static_cast(0.0), static_cast(0.0), static_cast(0.0), static_cast(1.0)}).to_device()); Tensor B = A; Tensor C = B; C.zero(); - + const char trans = 'N'; const int m = 3; const int n = 3; @@ -51,7 +51,7 @@ TYPED_TEST(LapackTest, Trtri) { // For this reason, we should employ 'L' instead of 'U' in the subsequent line. trtriCalculator('L', 'N', dim, B.data(), dim); gemmCalculator(trans, trans, m, n, k, &alpha, B.data(), k, A.data(), n, &beta, C.data(), n); - + EXPECT_EQ(C, I); } @@ -69,11 +69,11 @@ TYPED_TEST(LapackTest, Potrf) { Tensor A = std::move(Tensor({static_cast(4.0), static_cast(1.0), static_cast(2.0), static_cast(1.0), static_cast(5.0), static_cast(3.0), static_cast(2.0), static_cast(3.0), static_cast(6.0)}).to_device()); - + Tensor B = A; Tensor C = B; C.zero(); - + const char transa = 'N'; const char transb = 'C'; const int m = 3; @@ -96,7 +96,7 @@ TYPED_TEST(LapackTest, heevd) { using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; using Real = typename GetTypeReal::type; using Device = typename std::tuple_element<1, decltype(TypeParam())>::type; - + blas_gemm gemmCalculator; blas_axpy axpyCalculator; lapack_heevd heevdCalculator; @@ -105,14 +105,14 @@ TYPED_TEST(LapackTest, heevd) { Tensor A = std::move(Tensor({static_cast(4.0), static_cast(1.0), static_cast(1.0), static_cast(1.0), static_cast(5.0), static_cast(3.0), static_cast(1.0), static_cast(3.0), static_cast(6.0)}).to_device()); - + Tensor E = std::move(Tensor({static_cast(0.0), static_cast(0.0), static_cast(0.0)}).to_device()); Tensor B = A; Tensor expected_C1 = A; Tensor expected_C2 = A; expected_C1.zero(); expected_C2.zero(); - + const char trans = 'N'; const int m = 3; const int n = 3; @@ -122,10 +122,10 @@ TYPED_TEST(LapackTest, heevd) { // Note all blas and lapack operators within container are column major! // For this reason, we should employ 'L' instead of 'U' in the subsequent line. heevdCalculator('V', 'U', B.data(), dim, E.data()); - + E = E.to_device(); const Tensor Alpha = std::move(Tensor({ - static_cast(E.data()[0]), + static_cast(E.data()[0]), static_cast(E.data()[1]), static_cast(E.data()[2])})); @@ -143,7 +143,7 @@ TYPED_TEST(LapackTest, hegvd) { using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; using Real = typename GetTypeReal::type; using Device = typename std::tuple_element<1, decltype(TypeParam())>::type; - + blas_gemm gemmCalculator; blas_axpy axpyCalculator; lapack_hegvd hegvdCalculator; @@ -152,31 +152,31 @@ TYPED_TEST(LapackTest, hegvd) { Tensor A = std::move(Tensor({static_cast(4.0), static_cast(1.0), static_cast(1.0), static_cast(1.0), static_cast(5.0), static_cast(3.0), static_cast(1.0), static_cast(3.0), static_cast(6.0)}).to_device()); - + Tensor I = std::move(Tensor({static_cast(1.0), static_cast(0.0), static_cast(0.0), static_cast(0.0), static_cast(1.0), static_cast(0.0), static_cast(0.0), static_cast(0.0), static_cast(1.0)}).to_device()); - + Tensor E = std::move(Tensor({static_cast(0.0), static_cast(0.0), static_cast(0.0)}).to_device()); Tensor B = A; Tensor expected_C1 = A; Tensor expected_C2 = A; expected_C1.zero(); expected_C2.zero(); - + const char trans = 'N'; const int m = 3; const int n = 3; const int k = 3; const Type alpha = static_cast(1.0); const Type beta = static_cast(0.0); - // Note al(), I.data(), dim, E.data()); + hegvdCalculator(dim, dim, A.data(), I.data(), E.data(), B.data()); E = E.to_device(); const Tensor Alpha = std::move(Tensor({ - static_cast(E.data()[0]), + static_cast(E.data()[0]), static_cast(E.data()[1]), static_cast(E.data()[2])})); diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h index d65aa5a204..6bd137d718 100644 --- a/source/source_base/module_container/base/third_party/cusolver.h +++ b/source/source_base/module_container/base/third_party/cusolver.h @@ -145,14 +145,14 @@ static inline void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W) { // prepare some values for cusolverDnSsyevd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; float* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); @@ -171,14 +171,14 @@ static inline void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W) { // prepare some values for cusolverDnDsyevd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; double* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); @@ -197,14 +197,14 @@ static inline void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, float * W) { // prepare some values for cusolverDnCheevd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; cuComplex* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); @@ -223,14 +223,14 @@ static inline void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, double* W) { // prepare some values for cusolverDnZheevd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; cuDoubleComplex* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); @@ -250,19 +250,19 @@ static inline void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W) { // prepare some values for cusolverDnSsygvd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; float* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, B, ldb, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); // compute eigenvalues and eigenvectors. - cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, B, ldb, W, d_work, lwork, d_info)); cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); @@ -276,19 +276,19 @@ static inline void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W) { // prepare some values for cusolverDnDsygvd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; double* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, B, ldb, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); // compute eigenvalues and eigenvectors. - cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, A, lda, B, ldb, W, d_work, lwork, d_info)); cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); @@ -302,19 +302,19 @@ static inline void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, std::complex* B, const int& ldb, float* W) { // prepare some values for cusolverDnChegvd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; cuComplex* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, reinterpret_cast(B), ldb, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); // compute eigenvalues and eigenvectors. - cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, reinterpret_cast(B), ldb, W, d_work, lwork, d_info)); cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); @@ -328,19 +328,19 @@ static inline void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex* A, const int& lda, std::complex* B, const int& ldb, double* W) { // prepare some values for cusolverDnZhegvd_bufferSize - int lwork = 0; - int h_info = 0; + int lwork = 0; + int h_info = 0; int* d_info = nullptr; cuDoubleComplex* d_work = nullptr; cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); // calculate the sizes needed for pre-allocated buffer. - cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, reinterpret_cast(B), ldb, W, &lwork)); // allocate memory cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); // compute eigenvalues and eigenvectors. - cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), + cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), n, reinterpret_cast(A), lda, reinterpret_cast(B), ldb, W, d_work, lwork, d_info)); cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); @@ -351,6 +351,208 @@ void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j cudaErrcheck(cudaFree(d_work)); } +// ----------------------------- +// CUDA hegvx implementations +// ----------------------------- + +// static inline +// void hegvx(cusolverDnHandle_t& cusolver_handle, +// const int& itype, +// const char& jobz, +// const char& uplo, +// const int& n, +// float* A, +// const int& lda, +// float* B, +// const int& ldb, +// const int& il, +// const int& iu, +// const float& abstol, +// int& m_out, +// float* W, +// float* Z, +// const int& ldz) { +// // Step 1: Query workspace size +// int lwork = 0; +// cusolverErrcheck(cusolverDnSsygvdx_bufferSize( +// cusolver_handle, +// cublas_eig_type(itype), // itype +// cublas_eig_mode(jobz), // jobz +// cublas_fill_mode(uplo), // uplo +// n, A, lda, B, ldb, +// il, iu, // range: index interval +// abstol, // tolerance +// &m_out, // output: number of eigenvalues found +// W, // eigenvalues +// Z, ldz, // eigenvectors +// &lwork)); // workspace size + +// // Step 2: Allocate workspace and info +// float* d_work = nullptr; +// int* d_info = nullptr; +// cudaErrcheck(cudaMalloc(&d_work, sizeof(float) * lwork)); +// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); + +// // Step 3: Call the solver +// cusolverErrcheck(cusolverDnSsygvdx( +// cusolver_handle, +// cublas_eig_type(itype), +// cublas_eig_mode(jobz), +// cublas_fill_mode(uplo), +// n, A, lda, B, ldb, +// il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info)); + +// // Step 4: Check result +// int h_info = 0; +// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); +// if (h_info != 0) { +// throw std::runtime_error("hegvx: cusolverDnSsygvdx failed with info = " + std::to_string(h_info)); +// } + +// // Cleanup +// cudaErrcheck(cudaFree(d_work)); +// cudaErrcheck(cudaFree(d_info)); +// } + +// Double precision +// static inline +// void hegvx(cusolverDnHandle_t& cusolver_handle, +// const int& itype, +// const char& jobz, +// const char& uplo, +// const int& n, +// double* A, +// const int& lda, +// double* B, +// const int& ldb, +// const int& il, +// const int& iu, +// const double& abstol, +// int& m_out, +// double* W, +// double* Z, +// const int& ldz) { +// int lwork = 0; +// cusolverErrcheck(cusolverDnDsygvdx_bufferSize( +// cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb, +// il, iu, abstol, &m_out, W, Z, ldz, &lwork)); + +// double* d_work = nullptr; +// int* d_info = nullptr; +// cudaErrcheck(cudaMalloc(&d_work, sizeof(double) * lwork)); +// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); + +// cusolverErrcheck(cusolverDnDsygvdx( +// cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb, +// il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info)); + +// int h_info = 0; +// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); +// if (h_info != 0) { +// throw std::runtime_error("hegvx: cusolverDnDsygvdx failed with info = " + std::to_string(h_info)); +// } + +// cudaErrcheck(cudaFree(d_work)); +// cudaErrcheck(cudaFree(d_info)); +// } + +// // Complex single precision +// static inline +// void hegvx(cusolverDnHandle_t& cusolver_handle, +// const int& itype, +// const char& jobz, +// const char& uplo, +// const int& n, +// cuComplex* A, +// const int& lda, +// cuComplex* B, +// const int& ldb, +// const int& il, +// const int& iu, +// const float& abstol, +// int& m_out, +// float* W, +// cuComplex* Z, +// const int& ldz) { +// int lwork = 0; +// cusolverErrcheck(cusolverDnChegvdx_bufferSize( +// cusolver_handle, itype, jobz, uplo, n, +// reinterpret_cast(A), lda, +// reinterpret_cast(B), ldb, +// il, iu, abstol, &m_out, W, +// reinterpret_cast(Z), ldz, &lwork)); + +// cuComplex* d_work = nullptr; +// int* d_info = nullptr; +// cudaErrcheck(cudaMalloc(&d_work, sizeof(cuComplex) * lwork)); +// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); + +// cusolverErrcheck(cusolverDnChegvdx( +// cusolver_handle, itype, jobz, uplo, n, +// reinterpret_cast(A), lda, +// reinterpret_cast(B), ldb, +// il, iu, abstol, &m_out, W, +// reinterpret_cast(Z), ldz, d_work, lwork, d_info)); + +// int h_info = 0; +// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); +// if (h_info != 0) { +// throw std::runtime_error("hegvx: cusolverDnChegvdx failed with info = " + std::to_string(h_info)); +// } + +// cudaErrcheck(cudaFree(d_work)); +// cudaErrcheck(cudaFree(d_info)); +// } + +// // Complex double precision +// static inline +// void hegvx(cusolverDnHandle_t& cusolver_handle, +// const int& itype, +// const char& jobz, +// const char& uplo, +// const int& n, +// cuDoubleComplex* A, +// const int& lda, +// cuDoubleComplex* B, +// const int& ldb, +// const int& il, +// const int& iu, +// const double& abstol, +// int& m_out, +// double* W, +// cuDoubleComplex* Z, +// const int& ldz) { +// int lwork = 0; +// cusolverErrcheck(cusolverDnZhegvdx_bufferSize( +// cusolver_handle, itype, jobz, uplo, n, +// reinterpret_cast(A), lda, +// reinterpret_cast(B), ldb, +// il, iu, abstol, &m_out, W, +// reinterpret_cast(Z), ldz, &lwork)); + +// cuDoubleComplex* d_work = nullptr; +// int* d_info = nullptr; +// cudaErrcheck(cudaMalloc(&d_work, sizeof(cuDoubleComplex) * lwork)); +// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); + +// cusolverErrcheck(cusolverDnZhegvdx( +// cusolver_handle, itype, jobz, uplo, n, +// reinterpret_cast(A), lda, +// reinterpret_cast(B), ldb, +// il, iu, abstol, &m_out, W, +// reinterpret_cast(Z), ldz, d_work, lwork, d_info)); + +// int h_info = 0; +// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); +// if (h_info != 0) { +// throw std::runtime_error("hegvx: cusolverDnZhegvdx failed with info = " + std::to_string(h_info)); +// } + +// cudaErrcheck(cudaFree(d_work)); +// cudaErrcheck(cudaFree(d_info)); +// } + +// --- getrf static inline void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv) { @@ -528,4 +730,4 @@ void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n, } // namespace cuSolverConnector } // namespace container -#endif // BASE_THIRD_PARTY_CUSOLVER_H_ \ No newline at end of file +#endif // BASE_THIRD_PARTY_CUSOLVER_H_ diff --git a/source/source_hsolver/kernels/hegvd_op.cpp b/source/source_hsolver/kernels/hegvd_op.cpp index ab7c520c3d..0959338005 100644 --- a/source/source_hsolver/kernels/hegvd_op.cpp +++ b/source/source_hsolver/kernels/hegvd_op.cpp @@ -1,7 +1,6 @@ #include "source_hsolver/kernels/hegvd_op.h" #include "source_base/module_container/base/third_party/lapack.h" -#include #include #include @@ -146,7 +145,7 @@ struct hegvd_op /** * @brief heevx computes the first m eigenvalues and their corresponding eigenvectors of * a complex generalized Hermitian-definite eigenproblem. - * + * * both heevx and syevx are implemented through the `evx` interface of LAPACK. * wrapped in LapackWrapper::xheevx */ @@ -349,4 +348,4 @@ template struct heevx_op; template struct hegvx_op; // template struct hegv_op; #endif -} // namespace hsolver \ No newline at end of file +} // namespace hsolver From 39bbfe4fbeaea6dc39ee84b4a68ca6ff04fa0e03 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:18:19 +0800 Subject: [PATCH 02/17] Replace hegvd_op with lapack_hegvd in diago_dav_subspace --- source/source_hsolver/diago_dav_subspace.cpp | 9 ++-- source/source_hsolver/diago_dav_subspace.h | 8 +++- source/source_hsolver/diago_david.h | 43 +++++++++++--------- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp index 111ade3e43..5a525bd78d 100644 --- a/source/source_hsolver/diago_dav_subspace.cpp +++ b/source/source_hsolver/diago_dav_subspace.cpp @@ -4,12 +4,15 @@ #include "source_base/module_device/device.h" #include "source_base/timer.h" -#include "source_hsolver/kernels/hegvd_op.h" #include "source_base/kernels/math_kernel_op.h" -#include "source_hsolver/kernels/bpcg_kernel_op.h" // normalize_op, precondition_op, apply_eigenvalues_op #include "source_base/kernels/dsp/dsp_connector.h" +// #include "source_base/module_container/ATen/kernels/lapack.h" + +#include +#include "source_hsolver/kernels/hegvd_op.h" #include "source_hsolver/diag_hs_para.h" +#include "source_hsolver/kernels/bpcg_kernel_op.h" // normalize_op, precondition_op, apply_eigenvalues_op #include @@ -540,7 +543,7 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, if (this->diag_comm.rank == 0) { syncmem_complex_op()(this->d_scc, scc, nbase * this->nbase_x); - hegvd_op()(this->ctx, nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc); + ct::kernels::lapack_hegvd()(nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc); syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x); } #endif diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h index 53bcb3e57b..6ad3300b18 100644 --- a/source/source_hsolver/diago_dav_subspace.h +++ b/source/source_hsolver/diago_dav_subspace.h @@ -5,6 +5,8 @@ #include "source_base/module_device/device.h" // base_device #include "source_base/module_device/memory_op.h"// base_device::memory" +#include "source_base/module_container/ATen/kernels/lapack.h" + #include "source_hsolver/diag_comm_info.h" #include "source_hsolver/diag_const_nums.h" @@ -189,10 +191,14 @@ class Diago_DavSubspace using syncmem_h2d_op = base_device::memory::synchronize_memory_op; using syncmem_d2h_op = base_device::memory::synchronize_memory_op; + // Note that ct_Device is different from base_device! + using ct_Device = typename ct::PsiToContainer::type; + // using hegvd_op = container::kernels::lapack_hegvd; + const T *one = nullptr, *zero = nullptr, *neg_one = nullptr; const T one_ = static_cast(1.0), zero_ = static_cast(0.0), neg_one_ = static_cast(-1.0); }; } // namespace hsolver -#endif \ No newline at end of file +#endif diff --git a/source/source_hsolver/diago_david.h b/source/source_hsolver/diago_david.h index 5b65f22cec..02d2f46697 100644 --- a/source/source_hsolver/diago_david.h +++ b/source/source_hsolver/diago_david.h @@ -5,7 +5,10 @@ #include "source_base/module_device/device.h" // base_device #include "source_base/module_device/memory_op.h"// base_device::memory +// #include "source_base/module_container/ATen/kernels/lapack.h" // container::kernels + #include "source_hsolver/diag_comm_info.h" +#include "source_hsolver/kernels/hegvd_op.h" #include #include @@ -26,16 +29,16 @@ template , typename Device = base_device::DEVI class DiagoDavid { private: - // Note GetTypeReal::type will - // return T if T is real type(float, double), + // Note GetTypeReal::type will + // return T if T is real type(float, double), // otherwise return the real type of T(complex, std::complex) using Real = typename GetTypeReal::type; - + public: /** * @brief Constructor for the DiagoDavid class. - * + * * @param[in] precondition_in Pointer to the preconditioning matrix. * @param[in] nband_in Number of eigenpairs required(i.e. bands). * @param[in] dim_in Dimension of the matrix. @@ -44,10 +47,10 @@ class DiagoDavid * the reduced basis set before \b restart of Davidson. * @param[in] use_paw_in Flag indicating whether to use PAW. * @param[in] diag_comm_in Communication information for diagonalization. - * + * * @tparam T The data type of the matrices and arrays. * @tparam Device The device type (base_device::DEVICE_CPU or DEVICE_GPU). - * + * * @note Auxiliary memory is allocated in the constructor and deallocated in the destructor. */ DiagoDavid(const Real* precondition_in, @@ -59,10 +62,10 @@ class DiagoDavid /** * @brief Destructor for the DiagoDavid class. - * + * * This destructor releases the dynamically allocated memory used by the class members. * It deletes the basis, hpsi, spsi, hcc, vcc, lagrange_matrix, and eigenvalue arrays. - * + * */ ~DiagoDavid(); @@ -75,7 +78,7 @@ class DiagoDavid * This function type is used to define a matrix-blockvector operator H. * For eigenvalue problem HX = λX or generalized eigenvalue problem HX = λSX, * this function computes the product of the Hamiltonian matrix H and a blockvector X. - * + * * Called as follows: * hpsi(X, HX, ld, nvec) where X and HX are (ld, nvec)-shaped blockvectors. * Result HX = H * X is stored in HX. @@ -84,7 +87,7 @@ class DiagoDavid * @param[in] HX Head address of output blockvector of type `T*`. * @param[in] ld Leading dimension of blockvector. * @param[in] nvec Number of vectors in a block. - * + * * @warning X and HX are the exact address to read input X and store output H*X, * @warning both of size ld * nvec. */ @@ -92,7 +95,7 @@ class DiagoDavid /** * @brief A function type representing the SX function. - * + * * nrow is leading dimension of spsi, npw is leading dimension of psi, nbands is number of vecs * * This function type is used to define a matrix-blockvector operator S. @@ -108,9 +111,9 @@ class DiagoDavid /** * @brief Performs iterative diagonalization using the David algorithm. - * + * * @warning Please see docs of `HPsiFunc` for more information about the hpsi mat-vec interface. - * + * * @tparam T The type of the elements in the matrix. * @tparam Device The device type (CPU or GPU). * @param hpsi_func The function object that computes the matrix-blockvector product H * psi. @@ -123,13 +126,13 @@ class DiagoDavid * @param ntry_max The maximum number of attempts for the diagonalization restart. * @param notconv_max The maximum number of bands unconverged allowed. * @return The total number of iterations performed during the diagonalization. - * + * * @note ntry_max is an empirical parameter that should be specified in external routine, default 5 * notconv_max is determined by the accuracy required for the calculation, default 0 */ int diag( - const HPsiFunc& hpsi_func, // function void hpsi(T*, T*, const int, const int) - const SPsiFunc& spsi_func, // function void spsi(T*, T*, const int, const int, const int) + const HPsiFunc& hpsi_func, // function void hpsi(T*, T*, const int, const int) + const SPsiFunc& spsi_func, // function void spsi(T*, T*, const int, const int, const int) const int ld_psi, // Leading dimension of the psi input T *psi_in, // Pointer to eigenvectors Real* eigenvalue_in, // Pointer to store the resulting eigenvalues @@ -218,7 +221,7 @@ class DiagoDavid /** * Calculates the elements of the diagonalization matrix for the DiagoDavid class. - * + * * @param dim The dimension of the problem. * @param nbase The current dimension of the reduced basis. * @param nbase_x The maximum dimension of the reduced basis set. @@ -237,7 +240,7 @@ class DiagoDavid /** * Refreshes the diagonalization solver by updating the basis and the reduced Hamiltonian. - * + * * @param dim The dimension of the problem. * @param nband The number of bands. * @param nbase The number of basis states. @@ -249,7 +252,7 @@ class DiagoDavid * @param spsi Pointer to the output array for the updated basis set (nband-th column). * @param hcc Pointer to the output array for the updated reduced Hamiltonian. * @param vcc Pointer to the output array for the updated eigenvector matrix. - * + * */ void refresh(const int& dim, const int& nband, @@ -286,7 +289,7 @@ class DiagoDavid /** * @brief Plans the Schmidt orthogonalization for a given number of bands. - * + * * @tparam T The type of the elements in the vectors. * @tparam Device The device on which the computation will be performed. * @param nband The number of bands. From f98a920d36e7eadfd3e6ed9cc5a503807fb512f0 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:20:20 +0800 Subject: [PATCH 03/17] Remove itype parameter for heevx that is required only in gv --- .../base/third_party/lapack.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/source/source_base/module_container/base/third_party/lapack.h b/source/source_base/module_container/base/third_party/lapack.h index 0117734993..6e272d2b29 100644 --- a/source/source_base/module_container/base/third_party/lapack.h +++ b/source/source_base/module_container/base/third_party/lapack.h @@ -3,9 +3,9 @@ * @brief This is a direct wrapper of some LAPACK routines. * \b Column-Major version. * Direct wrapping of standard LAPACK routines. (Column-Major, fortran style) - * + * * @warning For Row-major version, please refer to \c source/source_base/module_external/lapack_connector.h. - * + * * @note * Some slight modification are made to fit the C++ style for overloading purpose. * You can find some function with different parameter list than the original LAPACK routine. @@ -294,7 +294,7 @@ void hegvx(const int itype, const char jobz, const char range, const char uplo, // wrap function of fortran lapack routine zheevx. static inline -void heevx( const int itype, const char jobz, const char range, const char uplo, const int n, +void heevx(const char jobz, const char range, const char uplo, const int n, float* a, const int lda, const float vl, const float vu, const int il, const int iu, const float abstol, const int m, float* w, float* z, const int ldz, @@ -307,11 +307,11 @@ void heevx( const int itype, const char jobz, const char range, const char uplo, } // wrap function of fortran lapack routine zheevx. static inline -void heevx( const int itype, const char jobz, const char range, const char uplo, const int n, - double* a, const int lda, - const double vl, const double vu, const int il, const int iu, const double abstol, - const int m, double* w, double* z, const int ldz, - double* work, const int lwork, double* rwork, int* iwork, int* ifail, int info) +void heevx(const char jobz, const char range, const char uplo, const int n, + double* a, const int lda, + const double vl, const double vu, const int il, const int iu, const double abstol, + const int m, double* w, double* z, const int ldz, + double* work, const int lwork, double* rwork, int* iwork, int* ifail, int info) { dsyevx_(&jobz, &range, &uplo, &n, a, &lda, &vl, &vu, &il, &iu, @@ -319,7 +319,7 @@ void heevx( const int itype, const char jobz, const char range, const char uplo, work, &lwork, rwork, iwork, ifail, &info); } static inline -void heevx( const int itype, const char jobz, const char range, const char uplo, const int n, +void heevx(const char jobz, const char range, const char uplo, const int n, std::complex* a, const int lda, const float vl, const float vu, const int il, const int iu, const float abstol, const int m, float* w, std::complex* z, const int ldz, @@ -332,7 +332,7 @@ void heevx( const int itype, const char jobz, const char range, const char uplo, } // wrap function of fortran lapack routine zheevx. static inline -void heevx( const int itype, const char jobz, const char range, const char uplo, const int n, +void heevx(const char jobz, const char range, const char uplo, const int n, std::complex* a, const int lda, const double vl, const double vu, const int il, const int iu, const double abstol, const int m, double* w, std::complex* z, const int ldz, @@ -399,17 +399,17 @@ static inline void potrf( const char &uplo, const int &n, float* A, const int &lda, int &info ) { spotrf_(&uplo, &n, A, &lda, &info ); -} +} static inline void potrf( const char &uplo, const int &n, double* A, const int &lda, int &info ) { dpotrf_(&uplo, &n, A, &lda, &info ); -} +} static inline void potrf( const char &uplo, const int &n, std::complex* A, const int &lda, int &info ) { cpotrf_(&uplo, &n, A, &lda, &info ); -} +} static inline void potrf( const char &uplo, const int &n, std::complex* A, const int &lda, int &info ) { From 679286e99d86fc0abb4fc6abe8c6368466b1c8e7 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:21:21 +0800 Subject: [PATCH 04/17] Update heevx to use correct arg list --- source/source_hsolver/kernels/hegvd_op.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/source/source_hsolver/kernels/hegvd_op.cpp b/source/source_hsolver/kernels/hegvd_op.cpp index 0959338005..aa9d56239b 100644 --- a/source/source_hsolver/kernels/hegvd_op.cpp +++ b/source/source_hsolver/kernels/hegvd_op.cpp @@ -177,7 +177,6 @@ struct heevx_op // When lwork = -1, the demension of work will be assumed // Assume the denmension of work by output work[0] lapackConnector::heevx( - 1, // ITYPE = 1: A*x = (lambda)*B*x 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. 'L', // UPLO = 'L': Lower triangles of A and B are stored. @@ -211,7 +210,6 @@ struct heevx_op // obtained by the zhegvx operation is (nstart * nstart) and stored in zux (internal to the function). When // the function is output, the data of zux will be mapped to the corresponding position of V. lapackConnector::heevx( - 1, // ITYPE = 1: A*x = (lambda)*B*x 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. 'L', // UPLO = 'L': Lower triangles of A and B are stored. From 82b8acf93964178410be0532843bbda0a35193cd Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sat, 18 Oct 2025 00:20:05 +0800 Subject: [PATCH 05/17] Add lapack_heevx --- .../ATen/kernels/cuda/lapack.cu | 33 +++ .../module_container/ATen/kernels/lapack.cpp | 97 +++++++ .../module_container/ATen/kernels/lapack.h | 41 ++- .../module_container/base/macros/cuda.h | 25 +- .../base/third_party/cusolver.h | 242 ++++++++++++++++++ 5 files changed, 434 insertions(+), 4 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu index 33e1eef7f5..b72ffbb3c0 100644 --- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu @@ -101,6 +101,39 @@ struct lapack_heevd { } }; +template +struct lapack_heevx { + using Real = typename GetTypeReal::type; + void operator()( + const int n, + const int lda, + T *d_Mat, + const int neig, + Real *d_eigen_val, + T *d_eigen_vec) + { + // copy d_Mat to d_eigen_vec, and results will be overwritten into d_eigen_vec + // by cuSolver + cudaErrcheck(cudaMemcpy(d_eigen_vec, d_Mat, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice)); + + int meig = 0; + + cuSolverConnector::heevdx( + cusolver_handle, + n, + lda, + d_eigen_vec, + 'V', // jobz: compute vectors + 'L', // uplo: lower triangle + 'I', // range: by index + 1, neig, // il, iu + Real(0), Real(0), // vl, vu (unused) + d_eigen_val, + &meig + ); + + } +}; template struct lapack_hegvd { using Real = typename GetTypeReal::type; diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp index 90a3c5a0b6..3780caadf6 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.cpp +++ b/source/source_base/module_container/ATen/kernels/lapack.cpp @@ -4,10 +4,17 @@ // #include // std::memcpy #include // std::copy +#include +#include namespace container { namespace kernels { +inline double get_real(const std::complex &x) { return x.real(); } +inline float get_real(const std::complex &x) { return x.real(); } +inline double get_real(const double &x) { return x; } +inline float get_real(const float &x) { return x; } + template struct set_matrix { void operator() ( @@ -95,6 +102,96 @@ struct lapack_heevd { } }; +template +struct lapack_heevx { + using Real = typename GetTypeReal::type; + void operator()( + const int n, + const int lda, + T *Mat, + const int neig, + Real *eigen_val, + T *eigen_vec) + { + Tensor aux(DataTypeToEnum::value, DeviceType::CpuDevice, {n * lda}); + // Copy Mat to aux since heevx will destroy it + // aux = Mat + std::copy(Mat, Mat + n * lda, aux); + + char jobz = 'V'; // Compute eigenvalues and eigenvectors + char range = 'I'; // Find eigenvalues in index range [il, iu] + char uplo = 'L'; // Use Lower triangle + int info = 0; + int found = 0; // Number of eigenvalues found + // found should be iu - il + 1, i.e. found = neig + const int il = 1; + const int iu = neig; + Real abstol = 0.0; + + // Workspace query first + int lwork = -1; + T work_query; + Real rwork_query; + int iwork_query; + int ifail_query; + + // Dummy call to get optimal workspace size + // when lwork = -1 + lapackConnector::heevx( + jobz, range, uplo, n, + aux, lda, + 0.0, 0.0, il, iu, // vl, vu not used when range='I' + abstol, + &found, + eigen_val, + eigen_vec, lda, + &work_query, lwork, + &rwork_query, + &iwork_query, + &ifail_query, + &info); + + if (info != 0) { + throw std::runtime_error("heevx workspace query failed with info = " + std::to_string(info)); + } + + lwork = static_cast(get_real(work_query)); + + // Allocate buffers using Tensor (RAII) + Tensor work(DataTypeToEnum::value, DeviceType::CpuDevice, {lwork}); + work.zero(); + + Tensor rwork(DataTypeToEnum::value, DeviceType::CpuDevice, {7 * n}); + rwork.zero(); + + Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {5 * n}); + iwork.zero(); + + Tensor ifail(DataType::DT_INT, DeviceType::CpuDevice, {n}); + ifail.zero(); + + // Actual call to heevx + lapackConnector::heevx( + jobz, range, uplo, n, + aux, lda, + 0.0, 0.0, il, iu, + abstol, + &found, + eigen_val, + eigen_vec, lda, + work.data(), lwork, + rwork.data(), + iwork.data(), + ifail.data(), + &info); + + if (info != 0) { + throw std::runtime_error("heevx failed with info = " + std::to_string(info)); + } + + } +}; + template struct lapack_hegvd { using Real = typename GetTypeReal::type; diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index 5f201769ce..8ec3bc73e1 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -1,6 +1,7 @@ #ifndef ATEN_KERNELS_LAPACK_H_ #define ATEN_KERNELS_LAPACK_H_ +#include "source_base/macros.h" #include #include @@ -51,6 +52,40 @@ struct lapack_heevd { Real* eigen_val); }; +template +struct lapack_heevx { + using Real = typename GetTypeReal::type; + /** + * @brief Computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix. + * + * This function solves the problem A*x = lambda*x, where A is a Hermitian matrix. + * It computes a subset of eigenvalues and, optionally, the corresponding eigenvectors. + * + * @param jobz 'N': Compute eigenvalues only; 'V': Compute eigenvalues and eigenvectors. + * @param range 'A': All eigenvalues; 'V': Eigenvalues in the half-open interval (vl, vu]; 'I': Eigenvalues with indices il through iu. + * @param uplo 'U': Upper triangle of A is stored; 'L': Lower triangle is stored. + * @param dim The order of the matrix A. dim >= 0. + * @param Mat On entry, the Hermitian matrix A. On exit, it may be overwritten. + * @param vl Lower bound of the interval to search for eigenvalues if range == 'V'. + * @param vu Upper bound of the interval to search for eigenvalues if range == 'V'. + * @param il Index of the smallest eigenvalue to be returned if range == 'I'. + * @param iu Index of the largest eigenvalue to be returned if range == 'I'. + * @param m Output: The total number of found eigenvalues. + * @param eigen_val Array to store the computed eigenvalues in ascending order. + * @param eigen_vec If not nullptr and jobz == 'V', array to store the computed eigenvectors. + * + * @note + * See LAPACK ZHEEVX or CHEEVX documentation for more details. + * + */ + void operator()( + const int dim, + const int lda, + T *Mat, + const int neig, + Real *eigen_val, + T *eigen_vec); +}; template struct lapack_hegvd { @@ -60,8 +95,8 @@ struct lapack_hegvd { * * This function solves the problem A*x = lambda*B*x, where A and B are Hermitian matrices, and B is also positive definite. * - * @param dim The order of the matrices Mat_A and Mat_B. dim >= 0. - * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, dim). + * @param n The order of the matrices Mat_A and Mat_B. n >= 0. + * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, n). * @param Mat_A On entry, the Hermitian matrix A. On exit, it may be overwritten. * @param Mat_B On entry, the Hermitian positive definite matrix B. On exit, it may be overwritten. * @param eigen_val Array to store the computed eigenvalues in ascending order. @@ -72,7 +107,7 @@ struct lapack_hegvd { * This function assumes that A and B have the same leading dimensions, lda. */ void operator()( - const int dim, + const int n, const int lda, T *Mat_A, T *Mat_B, diff --git a/source/source_base/module_container/base/macros/cuda.h b/source/source_base/module_container/base/macros/cuda.h index ac5ced387a..e9fb2866e7 100644 --- a/source/source_base/module_container/base/macros/cuda.h +++ b/source/source_base/module_container/base/macros/cuda.h @@ -121,6 +121,29 @@ static inline cusolverEigType_t cublas_eig_type(const int& itype) throw std::runtime_error("cublas_eig_mode: unknown diag"); } +/** + * @brief Converts a character specifying eigenvalue range to cuSOLVER enum. + * + * 'A' or 'a' -> CUSOLVER_EIG_RANGE_ALL: all eigenvalues + * 'V' or 'v' -> CUSOLVER_EIG_RANGE_V: values in [vl, vu] + * 'I' or 'i' -> CUSOLVER_EIG_RANGE_I: indices in [il, iu] + * + * @param range Character indicating selection mode ('A', 'V', 'I') + * @return Corresponding cusolverEigRange_t enum value + * @throws std::runtime_error if character is invalid + */ +static inline cusolverEigRange_t cublas_eig_range(const char& range) +{ + if (range == 'A' || range == 'a') + return CUSOLVER_EIG_RANGE_ALL; + else if (range == 'V' || range == 'v') + return CUSOLVER_EIG_RANGE_V; + else if (range == 'I' || range == 'i') + return CUSOLVER_EIG_RANGE_I; + else + throw std::runtime_error("cublas_eig_range: unknown range '" + std::string(1, range) + "'"); +} + // cuSOLVER API errors static const char* cusolverGetErrorEnum(cusolverStatus_t error) { @@ -226,4 +249,4 @@ inline void cublasAssert(cublasStatus_t res, const char* file, int line) #define cudaCheckOnDebug() #endif -#endif // BASE_MACROS_CUDA_H_ \ No newline at end of file +#endif // BASE_MACROS_CUDA_H_ diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h index 6bd137d718..b372babda9 100644 --- a/source/source_base/module_container/base/third_party/cusolver.h +++ b/source/source_base/module_container/base/third_party/cusolver.h @@ -246,6 +246,248 @@ void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u cudaErrcheck(cudaFree(d_work)); } +// ===================================================================================================== +// heevdx: Compute eigenvalues and eigenvectors of symmetric/Hermitian matrix +// ===================================================================================================== +// --- float --- +static inline +void heevdx(cusolverDnHandle_t& cusolver_handle, + const int n, + const int lda, + float* d_A, + const char jobz, + const char uplo, + const char range, + const int il, const int iu, + const float vl, const float vu, + float* d_eigen_val, + int* h_meig) +{ + int lwork = 0; + int* d_info = nullptr; + float* d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + cusolverEigRange_t range_t = cublas_eig_range(range); + + cusolverErrcheck(cusolverDnSsyevdx_bufferSize( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, d_A, lda, + vl, vu, il, iu, + h_meig, // ← int* output: number of eigenvalues found + d_eigen_val, // ← const float* W (used for query, can be dummy) + &lwork // ← int* lwork (output) + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); + + // Main call + cusolverErrcheck(cusolverDnSsyevdx( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, + d_A, lda, + vl, vu, il, iu, + h_meig, // ← int* output + d_eigen_val, // ← float* W: eigenvalues + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + cudaFree(d_info); cudaFree(d_work); + throw std::runtime_error("heevdx (float) failed with info = " + std::to_string(h_info)); + } + + cudaFree(d_info); + cudaFree(d_work); +} + +// --- double --- +static inline +void heevdx(cusolverDnHandle_t& cusolver_handle, + const int n, + const int lda, + double* d_A, + const char jobz, + const char uplo, + const char range, + const int il, const int iu, + const double vl, const double vu, + double* d_eigen_val, + int* h_meig) +{ + int lwork = 0; + int* d_info = nullptr; + double* d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + cusolverEigRange_t range_t = cublas_eig_range(range); + + cusolverErrcheck(cusolverDnDsyevdx_bufferSize( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, d_A, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); + + cusolverErrcheck(cusolverDnDsyevdx( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, + d_A, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + cudaFree(d_info); cudaFree(d_work); + throw std::runtime_error("heevdx (double) failed with info = " + std::to_string(h_info)); + } + + cudaFree(d_info); + cudaFree(d_work); +} + +// --- complex --- +static inline +void heevdx(cusolverDnHandle_t& cusolver_handle, + const int n, + const int lda, + std::complex* d_A, + const char jobz, + const char uplo, + const char range, + const int il, const int iu, + const float vl, const float vu, + float* d_eigen_val, + int* h_meig) +{ + int lwork = 0; + int* d_info = nullptr; + cuComplex* d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + cusolverEigRange_t range_t = cublas_eig_range(range); + + cusolverErrcheck(cusolverDnCheevdx_bufferSize( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, + reinterpret_cast(d_A), lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); + + cusolverErrcheck(cusolverDnCheevdx( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, + reinterpret_cast(d_A), lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + cudaFree(d_info); cudaFree(d_work); + throw std::runtime_error("heevdx (complex) failed with info = " + std::to_string(h_info)); + } + + cudaFree(d_info); + cudaFree(d_work); +} + +// --- complex --- +static inline +void heevdx(cusolverDnHandle_t& cusolver_handle, + const int n, + const int lda, + std::complex* d_A, + const char jobz, + const char uplo, + const char range, + const int il, const int iu, + const double vl, const double vu, + double* d_eigen_val, + int* h_meig) +{ + int lwork = 0; + int* d_info = nullptr; + cuDoubleComplex* d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + cusolverEigRange_t range_t = cublas_eig_range(range); + + cusolverErrcheck(cusolverDnZheevdx_bufferSize( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, + reinterpret_cast(d_A), lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); + + cusolverErrcheck(cusolverDnZheevdx( + cusolver_handle, + jobz_t, range_t, uplo_t, + n, + reinterpret_cast(d_A), lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info != 0) { + cudaFree(d_info); cudaFree(d_work); + throw std::runtime_error("heevdx (complex) failed with info = " + std::to_string(h_info)); + } + + cudaFree(d_info); + cudaFree(d_work); +} + static inline void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W) { From 93da75065fe6aba566e96c9c251fd0e7a4dcf55c Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sat, 18 Oct 2025 01:01:09 +0800 Subject: [PATCH 06/17] Fix lapack_heevx and add template instantiation --- .../ATen/kernels/cuda/lapack.cu | 7 +++++- .../module_container/ATen/kernels/lapack.cpp | 23 ++++++++++++------- .../module_container/ATen/kernels/lapack.h | 2 +- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu index b72ffbb3c0..bef3d04044 100644 --- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu @@ -107,7 +107,7 @@ struct lapack_heevx { void operator()( const int n, const int lda, - T *d_Mat, + const T *d_Mat, const int neig, Real *d_eigen_val, T *d_eigen_vec) @@ -232,6 +232,11 @@ template struct lapack_heevd; template struct lapack_heevd, DEVICE_GPU>; template struct lapack_heevd, DEVICE_GPU>; +template struct lapack_heevx; +template struct lapack_heevx; +template struct lapack_heevx, DEVICE_GPU>; +template struct lapack_heevx, DEVICE_GPU>; + template struct lapack_hegvd; template struct lapack_hegvd; template struct lapack_hegvd, DEVICE_GPU>; diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp index 3780caadf6..0003c23a46 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.cpp +++ b/source/source_base/module_container/ATen/kernels/lapack.cpp @@ -1,9 +1,11 @@ +#include "source_base/module_device/types.h" #include #include // #include // std::memcpy #include // std::copy +#include #include #include @@ -108,7 +110,7 @@ struct lapack_heevx { void operator()( const int n, const int lda, - T *Mat, + const T *Mat, const int neig, Real *eigen_val, T *eigen_vec) @@ -116,7 +118,7 @@ struct lapack_heevx { Tensor aux(DataTypeToEnum::value, DeviceType::CpuDevice, {n * lda}); // Copy Mat to aux since heevx will destroy it // aux = Mat - std::copy(Mat, Mat + n * lda, aux); + std::copy(Mat, Mat + n * lda, aux.data()); char jobz = 'V'; // Compute eigenvalues and eigenvectors char range = 'I'; // Find eigenvalues in index range [il, iu] @@ -139,17 +141,17 @@ struct lapack_heevx { // when lwork = -1 lapackConnector::heevx( jobz, range, uplo, n, - aux, lda, + aux.data(), lda, 0.0, 0.0, il, iu, // vl, vu not used when range='I' abstol, - &found, + found, eigen_val, eigen_vec, lda, &work_query, lwork, &rwork_query, &iwork_query, &ifail_query, - &info); + info); if (info != 0) { throw std::runtime_error("heevx workspace query failed with info = " + std::to_string(info)); @@ -173,17 +175,17 @@ struct lapack_heevx { // Actual call to heevx lapackConnector::heevx( jobz, range, uplo, n, - aux, lda, + aux.data(), lda, 0.0, 0.0, il, iu, abstol, - &found, + found, eigen_val, eigen_vec, lda, work.data(), lwork, rwork.data(), iwork.data(), ifail.data(), - &info); + info); if (info != 0) { throw std::runtime_error("heevx failed with info = " + std::to_string(info)); @@ -388,6 +390,11 @@ template struct lapack_heevd; template struct lapack_heevd, DEVICE_CPU>; template struct lapack_heevd, DEVICE_CPU>; +template struct lapack_heevx; +template struct lapack_heevx; +template struct lapack_heevx, DEVICE_CPU>; +template struct lapack_heevx, DEVICE_CPU>; + template struct lapack_hegvd; template struct lapack_hegvd; template struct lapack_hegvd, DEVICE_CPU>; diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index 8ec3bc73e1..32778a5cc5 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -81,7 +81,7 @@ struct lapack_heevx { void operator()( const int dim, const int lda, - T *Mat, + const T *Mat, const int neig, Real *eigen_val, T *eigen_vec); From 233254a2eb76e8c45698325737c2d2d91c243d94 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Sat, 18 Oct 2025 01:01:52 +0800 Subject: [PATCH 07/17] Replace hsolver::heevx_op with ct::kernels::lapack_heevx in diago_david --- source/source_hsolver/diago_david.cpp | 12 +++++++----- source/source_hsolver/diago_david.h | 4 +++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/source/source_hsolver/diago_david.cpp b/source/source_hsolver/diago_david.cpp index 607041089f..bf390104af 100644 --- a/source/source_hsolver/diago_david.cpp +++ b/source/source_hsolver/diago_david.cpp @@ -12,7 +12,7 @@ using namespace hsolver; template -DiagoDavid::DiagoDavid(const Real* precondition_in, +DiagoDavid::DiagoDavid(const Real* precondition_in, const int nband_in, const int dim_in, const int david_ndim_in, @@ -80,7 +80,7 @@ DiagoDavid::DiagoDavid(const Real* precondition_in, resmem_complex_op()(this->vcc, nbase_x * nbase_x, "DAV::vcc"); setmem_complex_op()(this->vcc, 0, nbase_x * nbase_x); //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - + // lagrange_matrix(nband, nband); // for orthogonalization resmem_complex_op()(this->lagrange_matrix, nband * nband); setmem_complex_op()(this->lagrange_matrix, 0, nband * nband); @@ -409,7 +409,7 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // basis[nbase] = basis[nbase] - spsi * vc_ev_vector // = hpsi - spsi * lambda * vcc // = (H - lambda * S) * psi * vcc - // = (H - lambda * S) * psi_new + // = (H - lambda * S) * psi_new //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ModuleBase::gemm_op()('N', 'N', @@ -622,7 +622,8 @@ void DiagoDavid::diag_zhegvx(const int& nbase, resmem_var_op()(eigenvalue_gpu, nbase_x); syncmem_var_h2d_op()(eigenvalue_gpu, this->eigenvalue, nbase_x); - heevx_op()(this->ctx, nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc); + // heevx_op()(this->ctx, nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc); + ct::kernels::lapack_heevx()(nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc); syncmem_var_d2h_op()(this->eigenvalue, eigenvalue_gpu, nbase_x); delmem_var_op()(eigenvalue_gpu); @@ -630,7 +631,8 @@ void DiagoDavid::diag_zhegvx(const int& nbase, } else { - heevx_op()(this->ctx, nbase, nbase_x, hcc, nband, this->eigenvalue, vcc); + //heevx_op()(this->ctx, nbase, nbase_x, hcc, nband, this->eigenvalue, vcc); + ct::kernels::lapack_heevx()(nbase, nbase_x, hcc, nband, this->eigenvalue, vcc); } } diff --git a/source/source_hsolver/diago_david.h b/source/source_hsolver/diago_david.h index 02d2f46697..75a745d326 100644 --- a/source/source_hsolver/diago_david.h +++ b/source/source_hsolver/diago_david.h @@ -5,7 +5,7 @@ #include "source_base/module_device/device.h" // base_device #include "source_base/module_device/memory_op.h"// base_device::memory -// #include "source_base/module_container/ATen/kernels/lapack.h" // container::kernels +#include "source_base/module_container/ATen/kernels/lapack.h" // container::kernels #include "source_hsolver/diag_comm_info.h" #include "source_hsolver/kernels/hegvd_op.h" @@ -341,6 +341,8 @@ class DiagoDavid using syncmem_h2d_op = base_device::memory::synchronize_memory_op; using syncmem_d2h_op = base_device::memory::synchronize_memory_op; + // Note that ct_Device is different from base_device! + using ct_Device = typename ct::PsiToContainer::type; // using hpsi_info = typename hamilt::Operator::hpsi_info; // Dependence of hpsi removed const T *one = nullptr, *zero = nullptr, *neg_one = nullptr; From c42bbc04f202dac17662debe1177a4b203a314a3 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 20 Oct 2025 01:37:21 +0800 Subject: [PATCH 08/17] Add lapack_hegvx --- .../ATen/kernels/cuda/lapack.cu | 53 +- .../module_container/ATen/kernels/lapack.cpp | 180 +++--- .../module_container/ATen/kernels/lapack.h | 64 +- .../base/third_party/cusolver.h | 567 ++++++++++++------ 4 files changed, 540 insertions(+), 324 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu index bef3d04044..34fca37898 100644 --- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu @@ -149,26 +149,40 @@ struct lapack_hegvd { const char jobz = 'V'; const char uplo = 'U'; cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice)); - cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val); + cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, + eigen_vec, lda, Mat_B, lda, + eigen_val); } }; -// template -// struct lapack_hegvx { -// using Real = typename GetTypeReal::type; -// void operator()( -// const int n, -// const int lda, -// T *A, -// T *B, -// const int m, -// Real *eigen_val, -// T *eigen_vec) -// { -// cuSolverConnector::hegvx(cusolver_handle, n, lda, A, B, m, eigen_val, eigen_vec); -// } -// }; -// +template +struct lapack_hegvx { + using Real = typename GetTypeReal::type; + void operator()( + const int n, + const int lda, + T *A, + T *B, + const int m, + Real *eigen_val, + T *eigen_vec) + { + const int itype = 1; + const char jobz = 'V'; + const char range = 'I'; + const char uplo = 'U'; + int meig = 0; + cuSolverConnector::hegvdx(cusolver_handle, + itype, jobz, range, uplo, + n, lda, A, B, + Real(0), Real(0), + 1, m, &meig, + eigen_val, eigen_vec); + } +}; + + + template struct lapack_getrf { void operator()( @@ -242,6 +256,11 @@ template struct lapack_hegvd; template struct lapack_hegvd, DEVICE_GPU>; template struct lapack_hegvd, DEVICE_GPU>; +template struct lapack_hegvx; +template struct lapack_hegvx; +template struct lapack_hegvx, DEVICE_GPU>; +template struct lapack_hegvx, DEVICE_GPU>; + template struct lapack_getrf; template struct lapack_getrf; template struct lapack_getrf, DEVICE_GPU>; diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp index 0003c23a46..185707ff8f 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.cpp +++ b/source/source_base/module_container/ATen/kernels/lapack.cpp @@ -217,7 +217,7 @@ struct lapack_hegvd { const int itype = 1; const char jobz = 'V'; - const char uplo = 'U'; + const char uplo = 'L'; int info = 0; int lwork = std::max(2 * dim + dim * dim, 1 + 6 * dim + 2 * dim * dim); Tensor work(DataTypeToEnum::value, DeviceType::CpuDevice, {lwork}); @@ -240,80 +240,105 @@ struct lapack_hegvd { }; -// template -// struct lapack_hegvx { -// using Real = typename GetTypeReal::type; -// void operator()( -// const int n, -// const int lda, -// T *A, -// T *B, -// const int m, -// Real *eigen_val, -// T *eigen_vec) -// { -// int info = 0; - -// int mm = m; - -// int lwork = -1; - -// T *work = new T[1]; -// Real *rwork = new Real[7 * n]; -// int *iwork = new int[5 * n]; -// int *ifail = new int[n]; - -// // set lwork = -1 to query optimal work size -// lapackConnector::hegvx(1, // ITYPE = 1: A*x = (lambda)*B*x -// 'V', 'I', 'U', -// n, A, lda, B, lda, -// 0.0, 0.0, -// 1, m, 0.0, mm, -// eigen_val, eigen_vec, lda, -// work, -// lwork, // lwork = 1, query optimal size. -// rwork, iwork, ifail, -// info); - -// // !> If LWORK = -1, then a workspace query is assumed; the routine -// // !> only calculates the optimal size of the WORK array, returns -// // !> this value as the first entry of the WORK array. -// lwork = int(get_real(work[0])); -// delete[] work; -// work = new T[lwork]; - -// lapackConnector::hegvx( -// 1, // ITYPE = 1: A*x = (lambda)*B*x -// 'V', // JOBZ = 'V': Compute eigenvalues and eigenvectors. -// 'I', // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. -// 'U', // UPLO = 'U': Upper triangles of A and B are stored. -// n, // order of the matrices A and B. -// A, // A is COMPLEX*16 array dimension (LDA, N) -// lda, // leading dimension of the array A. -// B, // B is COMPLEX*16 array, dimension (LDB, N) -// lda, // assume that leading dimension of B is the same as A. -// 0.0, // VL, Not referenced if RANGE = 'A' or 'I'. -// 0.0, // VU, Not referenced if RANGE = 'A' or 'I'. -// 1, // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N, -// m, // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N, -// 0.0, // ABSTOL -// mm, // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1. -// eigen_val, // W store eigenvalues -// eigen_vec, // Z store eigenvector -// lda, // LDZ: The leading dimension of the array Z. -// work, -// lwork, -// rwork, -// iwork, -// ifail, -// info); - -// delete[] work; -// delete[] rwork; -// delete[] iwork; -// delete[] ifail; -// } -// }; +template +struct lapack_hegvx { + using Real = typename GetTypeReal::type; + void operator()( + const int n, + const int lda, + T *Mat_A, + T *Mat_B, + const int m, + Real *eigen_val, + T *eigen_vec) + { + // first copy Mat_A and Mat_B to auxiliary memory + // to avoid the origin block being overwritten by hegvx + Tensor aux_A(DataTypeToEnum::value, DeviceType::CpuDevice, {n * lda}); + std::copy(Mat_A, Mat_A + n * lda, aux_A.data()); + Tensor aux_B(DataTypeToEnum::value, DeviceType::CpuDevice, {n * lda}); + std::copy(Mat_B, Mat_B + n * lda, aux_B.data()); + + const int itype = 1; // ITYPE = 1: A*x = (lambda)*B*x + const char jobz = 'V';// JOBZ = 'V': Compute eigenvalues and eigenvectors. + const char range = 'I'; // RANGE = 'I': the IL-th through IU-th eigenvalues will be found. + const char uplo = 'L'; // UPLO = 'L': Lower triangles of A and B are stored. + + const int il = 1; + const int iu = m; + int found = m; // Found, should be iu - il + 1 + int info = 0; + + int lwork = -1; + + T work_query; + Real rwork_query; + + // set lwork = -1 to query optimal work size + lapackConnector::hegvx( + itype, jobz, range, uplo, + n, + aux_A.data(), lda, // A (in/out) + aux_B.data(), lda, // B (in/out) + 0.0, 0.0, // VL, VU (not used) + il, iu, // IL, IU + Real(0.0), // ABSTOL + found, // M (output) + eigen_val, // W (output) + eigen_vec, lda, // Z (output) + &work_query, // WORK (query) + lwork, + &rwork_query, // RWORK (query) + static_cast(nullptr), // IWORK (query) + static_cast(nullptr), // IFAIL (query) + info); + + // !> If LWORK = -1, then a workspace query is assumed; the routine + // !> only calculates the optimal size of the WORK array, returns + // !> this value as the first entry of the WORK array. + lwork = static_cast(get_real(work_query)); + lwork = std::max(lwork, 1); + + // work space + Tensor work(DataTypeToEnum::value, DeviceType::CpuDevice, {lwork}); + work.zero(); + + const int lrwork = 7 * n; + Tensor rwork(DataTypeToEnum::value, DeviceType::CpuDevice, {lrwork}); + rwork.zero(); + + const int liwork = 5 * n; + Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {liwork}); + iwork.zero(); + + std::vector ifail(n); + + lapackConnector::hegvx( + itype, jobz, range, uplo, + n, + aux_A.data(), lda, // A + aux_B.data(), lda, // B + 0.0, 0.0, // VL, VU + il, iu, // IL, IU + Real(0.0), // ABSTOL + found, // M (output) + eigen_val, // W + eigen_vec, lda, // Z (output) + work.data(), // WORK + lwork, + rwork.data(), // RWORK + iwork.data(), // IWORK + ifail.data(), // IFAIL + info); + + if (info < 0) { + throw std::runtime_error("hegvx failed: illegal argument #" + std::to_string(-info)); + } + if (info > 0) { + throw std::runtime_error("hegvx failed to converge. Number of converged eigenvalues: " + std::to_string(info)); + } + } +}; template struct lapack_getrf { @@ -400,6 +425,11 @@ template struct lapack_hegvd; template struct lapack_hegvd, DEVICE_CPU>; template struct lapack_hegvd, DEVICE_CPU>; +template struct lapack_hegvx; +template struct lapack_hegvx; +template struct lapack_hegvx, DEVICE_CPU>; +template struct lapack_hegvx, DEVICE_CPU>; + template struct lapack_getrf; template struct lapack_getrf; template struct lapack_getrf, DEVICE_CPU>; diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index 32778a5cc5..2914ee9fe6 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -115,38 +115,38 @@ struct lapack_hegvd { T *eigen_vec); }; -// template -// struct lapack_hegvx { -// using Real = typename GetTypeReal::type; -// /** -// * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of -// * a complex generalized Hermitian-definite eigenproblem. -// * -// * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version -// * is implemented through the `evd` interface and acquires the first m eigenpairs -// * -// * hegvx 'V' 'I' 'U' is used to compute the first m eigenpairs of the problem -// * -// * @param n The order of the matrices A and B. n >= 0. -// * @param lda The leading dimension of the array A and B. lda >= max(1, n). -// * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors. -// * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B. -// * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n. -// * @param eigen_val The first m eigenvalues in ascending order. -// * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues. -// * -// * @note -// * See LAPACK ZHEGVX doc for more details. -// */ -// void operator()( -// const int n, -// const int lda, -// T *A, -// T *B, -// const int m, -// Real *eigen_val, -// T *eigen_vec); -// }; +template +struct lapack_hegvx { + using Real = typename GetTypeReal::type; + /** + * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of + * a complex generalized Hermitian-definite eigenproblem. + * + * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version + * is implemented through the `evd` interface and acquires the first m eigenpairs + * + * hegvx 'V' 'I' 'U' is used to compute the first m eigenpairs of the problem + * + * @param n The order of the matrices A and B. n >= 0. + * @param lda The leading dimension of the array A and B. lda >= max(1, n). + * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors. + * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B. + * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n. + * @param eigen_val The first m eigenvalues in ascending order. + * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues. + * + * @note + * See LAPACK ZHEGVX doc for more details. + */ + void operator()( + const int n, + const int lda, + T *Mat_A, + T *Mat_B, + const int m, + Real *eigen_val, + T *eigen_vec); +}; template diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h index b372babda9..affdc4ca48 100644 --- a/source/source_base/module_container/base/third_party/cusolver.h +++ b/source/source_base/module_container/base/third_party/cusolver.h @@ -593,206 +593,373 @@ void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j cudaErrcheck(cudaFree(d_work)); } -// ----------------------------- -// CUDA hegvx implementations -// ----------------------------- - -// static inline -// void hegvx(cusolverDnHandle_t& cusolver_handle, -// const int& itype, -// const char& jobz, -// const char& uplo, -// const int& n, -// float* A, -// const int& lda, -// float* B, -// const int& ldb, -// const int& il, -// const int& iu, -// const float& abstol, -// int& m_out, -// float* W, -// float* Z, -// const int& ldz) { -// // Step 1: Query workspace size -// int lwork = 0; -// cusolverErrcheck(cusolverDnSsygvdx_bufferSize( -// cusolver_handle, -// cublas_eig_type(itype), // itype -// cublas_eig_mode(jobz), // jobz -// cublas_fill_mode(uplo), // uplo -// n, A, lda, B, ldb, -// il, iu, // range: index interval -// abstol, // tolerance -// &m_out, // output: number of eigenvalues found -// W, // eigenvalues -// Z, ldz, // eigenvectors -// &lwork)); // workspace size - -// // Step 2: Allocate workspace and info -// float* d_work = nullptr; -// int* d_info = nullptr; -// cudaErrcheck(cudaMalloc(&d_work, sizeof(float) * lwork)); -// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); - -// // Step 3: Call the solver -// cusolverErrcheck(cusolverDnSsygvdx( -// cusolver_handle, -// cublas_eig_type(itype), -// cublas_eig_mode(jobz), -// cublas_fill_mode(uplo), -// n, A, lda, B, ldb, -// il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info)); - -// // Step 4: Check result -// int h_info = 0; -// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); -// if (h_info != 0) { -// throw std::runtime_error("hegvx: cusolverDnSsygvdx failed with info = " + std::to_string(h_info)); -// } - -// // Cleanup -// cudaErrcheck(cudaFree(d_work)); -// cudaErrcheck(cudaFree(d_info)); -// } - -// Double precision -// static inline -// void hegvx(cusolverDnHandle_t& cusolver_handle, -// const int& itype, -// const char& jobz, -// const char& uplo, -// const int& n, -// double* A, -// const int& lda, -// double* B, -// const int& ldb, -// const int& il, -// const int& iu, -// const double& abstol, -// int& m_out, -// double* W, -// double* Z, -// const int& ldz) { -// int lwork = 0; -// cusolverErrcheck(cusolverDnDsygvdx_bufferSize( -// cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb, -// il, iu, abstol, &m_out, W, Z, ldz, &lwork)); - -// double* d_work = nullptr; -// int* d_info = nullptr; -// cudaErrcheck(cudaMalloc(&d_work, sizeof(double) * lwork)); -// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); - -// cusolverErrcheck(cusolverDnDsygvdx( -// cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb, -// il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info)); - -// int h_info = 0; -// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); -// if (h_info != 0) { -// throw std::runtime_error("hegvx: cusolverDnDsygvdx failed with info = " + std::to_string(h_info)); -// } - -// cudaErrcheck(cudaFree(d_work)); -// cudaErrcheck(cudaFree(d_info)); -// } - -// // Complex single precision -// static inline -// void hegvx(cusolverDnHandle_t& cusolver_handle, -// const int& itype, -// const char& jobz, -// const char& uplo, -// const int& n, -// cuComplex* A, -// const int& lda, -// cuComplex* B, -// const int& ldb, -// const int& il, -// const int& iu, -// const float& abstol, -// int& m_out, -// float* W, -// cuComplex* Z, -// const int& ldz) { -// int lwork = 0; -// cusolverErrcheck(cusolverDnChegvdx_bufferSize( -// cusolver_handle, itype, jobz, uplo, n, -// reinterpret_cast(A), lda, -// reinterpret_cast(B), ldb, -// il, iu, abstol, &m_out, W, -// reinterpret_cast(Z), ldz, &lwork)); - -// cuComplex* d_work = nullptr; -// int* d_info = nullptr; -// cudaErrcheck(cudaMalloc(&d_work, sizeof(cuComplex) * lwork)); -// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); - -// cusolverErrcheck(cusolverDnChegvdx( -// cusolver_handle, itype, jobz, uplo, n, -// reinterpret_cast(A), lda, -// reinterpret_cast(B), ldb, -// il, iu, abstol, &m_out, W, -// reinterpret_cast(Z), ldz, d_work, lwork, d_info)); - -// int h_info = 0; -// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); -// if (h_info != 0) { -// throw std::runtime_error("hegvx: cusolverDnChegvdx failed with info = " + std::to_string(h_info)); -// } - -// cudaErrcheck(cudaFree(d_work)); -// cudaErrcheck(cudaFree(d_info)); -// } - -// // Complex double precision -// static inline -// void hegvx(cusolverDnHandle_t& cusolver_handle, -// const int& itype, -// const char& jobz, -// const char& uplo, -// const int& n, -// cuDoubleComplex* A, -// const int& lda, -// cuDoubleComplex* B, -// const int& ldb, -// const int& il, -// const int& iu, -// const double& abstol, -// int& m_out, -// double* W, -// cuDoubleComplex* Z, -// const int& ldz) { -// int lwork = 0; -// cusolverErrcheck(cusolverDnZhegvdx_bufferSize( -// cusolver_handle, itype, jobz, uplo, n, -// reinterpret_cast(A), lda, -// reinterpret_cast(B), ldb, -// il, iu, abstol, &m_out, W, -// reinterpret_cast(Z), ldz, &lwork)); - -// cuDoubleComplex* d_work = nullptr; -// int* d_info = nullptr; -// cudaErrcheck(cudaMalloc(&d_work, sizeof(cuDoubleComplex) * lwork)); -// cudaErrcheck(cudaMalloc(&d_info, sizeof(int))); - -// cusolverErrcheck(cusolverDnZhegvdx( -// cusolver_handle, itype, jobz, uplo, n, -// reinterpret_cast(A), lda, -// reinterpret_cast(B), ldb, -// il, iu, abstol, &m_out, W, -// reinterpret_cast(Z), ldz, d_work, lwork, d_info)); - -// int h_info = 0; -// cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); -// if (h_info != 0) { -// throw std::runtime_error("hegvx: cusolverDnZhegvdx failed with info = " + std::to_string(h_info)); -// } - -// cudaErrcheck(cudaFree(d_work)); -// cudaErrcheck(cudaFree(d_info)); -// } +// ===================================================================================================== +// hegvd x: Compute selected eigenvalues and eigenvectors of generalized Hermitian-definite eigenproblem +// A * x = lambda * B * x +// ===================================================================================================== + +// --- float --- +static inline +void hegvdx( + cusolverDnHandle_t& cusolver_handle, + const int itype, // 1: A*x = lambda*B*x + const char jobz, // 'V' or 'N' + const char range, // 'I', 'V', 'A' + const char uplo, // 'U' or 'L' + const int n, + const int lda, + float* d_A, // Input matrix A (device) + float* d_B, // Input matrix B (device) + const float vl, // for RANGE='V' + const float vu, + const int il, // for RANGE='I' + const int iu, + int* h_meig, // output: number of eigenvalues found + float* d_eigen_val, // output: eigenvalues + float* d_eigen_vec // output: eigenvectors (if jobz='V'), size ldz × m +) { + int lwork = 0; + int *d_info = nullptr; + float *d_work = nullptr; + + // Allocate device info + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + // Copy A and B to temporary buffers since sygvdx may modify them + float *d_A_copy = nullptr, *d_B_copy = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(float) * n * lda)); + cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(float) * n * lda)); + cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice)); + cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice)); + + // Set parameters + cusolverEigType_t itype_t = cublas_eig_type(itype); + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cusolverEigRange_t range_t = cublas_eig_range(range); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + + // Query workspace size + cusolverErrcheck(cusolverDnSsygvdx_bufferSize( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + // Allocate workspace + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork)); + + // Main call + cusolverErrcheck(cusolverDnSsygvdx( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + // Check result + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info < 0) { + throw std::runtime_error("hegvdx (float): illegal argument #" + std::to_string(-h_info)); + } else if (h_info > 0) { + // If h_info <= n: convergence issue in tridiag solver (no vec) OR + // If h_info > n: B's leading minor of order (h_info - n) is not positive definite + if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) { + throw std::runtime_error("hegvdx (float): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge"); + } else if (h_info > n) { + throw std::runtime_error("hegvdx (float): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite"); + } + } + + // If jobz == 'V', copy eigenvectors from A (which now contains Z) to output + if (jobz == 'V') { + const int m = (*h_meig); // number of eigenvectors computed + cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(float) * n * m, cudaMemcpyDeviceToDevice)); + } + + // Cleanup + cudaFree(d_info); + cudaFree(d_work); + cudaFree(d_A_copy); + cudaFree(d_B_copy); +} + + +// --- double --- +static inline +void hegvdx( + cusolverDnHandle_t& cusolver_handle, + const int itype, + const char jobz, + const char range, + const char uplo, + const int n, + const int lda, + double* d_A, + double* d_B, + const double vl, + const double vu, + const int il, + const int iu, + int* h_meig, + double* d_eigen_val, + double* d_eigen_vec +) { + int lwork = 0; + int *d_info = nullptr; + double *d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + double *d_A_copy = nullptr, *d_B_copy = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(double) * n * lda)); + cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(double) * n * lda)); + cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice)); + cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice)); + + cusolverEigType_t itype_t = cublas_eig_type(itype); + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cusolverEigRange_t range_t = cublas_eig_range(range); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + + cusolverErrcheck(cusolverDnDsygvdx_bufferSize( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork)); + + cusolverErrcheck(cusolverDnDsygvdx( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info < 0) { + throw std::runtime_error("hegvdx (double): illegal argument #" + std::to_string(-h_info)); + } else if (h_info > 0) { + if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) { + throw std::runtime_error("hegvdx (double): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge"); + } else if (h_info > n) { + throw std::runtime_error("hegvdx (double): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite"); + } + } + + if (jobz == 'V') { + const int m = (*h_meig); + cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(double) * n * m, cudaMemcpyDeviceToDevice)); + } + + cudaFree(d_info); + cudaFree(d_work); + cudaFree(d_A_copy); + cudaFree(d_B_copy); +} + + +// --- complex --- +static inline +void hegvdx( + cusolverDnHandle_t& cusolver_handle, + const int itype, + const char jobz, + const char range, + const char uplo, + const int n, + const int lda, + std::complex* d_A, + std::complex* d_B, + const float vl, + const float vu, + const int il, + const int iu, + int* h_meig, + float* d_eigen_val, + std::complex* d_eigen_vec +) { + int lwork = 0; + int *d_info = nullptr; + cuComplex *d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cuComplex *d_A_copy = nullptr, *d_B_copy = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuComplex) * n * lda)); + cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuComplex) * n * lda)); + cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast(d_A), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice)); + cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast(d_B), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice)); + + cusolverEigType_t itype_t = cublas_eig_type(itype); + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cusolverEigRange_t range_t = cublas_eig_range(range); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + + cusolverErrcheck(cusolverDnChegvdx_bufferSize( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork)); + + cusolverErrcheck(cusolverDnChegvdx( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info < 0) { + throw std::runtime_error("hegvdx (complex): illegal argument #" + std::to_string(-h_info)); + } else if (h_info > 0) { + if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) { + throw std::runtime_error("hegvdx (complex): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge"); + } else if (h_info > n) { + throw std::runtime_error("hegvdx (complex): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite"); + } + } + + if (jobz == 'V') { + const int m = (*h_meig); + cudaErrcheck(cudaMemcpy(reinterpret_cast(d_eigen_vec), d_A_copy, sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice)); + } + + cudaFree(d_info); + cudaFree(d_work); + cudaFree(d_A_copy); + cudaFree(d_B_copy); +} + + +// --- complex --- +static inline +void hegvdx( + cusolverDnHandle_t& cusolver_handle, + const int itype, + const char jobz, + const char range, + const char uplo, + const int n, + const int lda, + std::complex* d_A, + std::complex* d_B, + const double vl, + const double vu, + const int il, + const int iu, + int* h_meig, + double* d_eigen_val, + std::complex* d_eigen_vec +) { + int lwork = 0; + int *d_info = nullptr; + cuDoubleComplex *d_work = nullptr; + + cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int))); + + cuDoubleComplex *d_A_copy = nullptr, *d_B_copy = nullptr; + cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuDoubleComplex) * n * lda)); + cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuDoubleComplex) * n * lda)); + cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast(d_A), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice)); + cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast(d_B), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice)); + + cusolverEigType_t itype_t = cublas_eig_type(itype); + cusolverEigMode_t jobz_t = cublas_eig_mode(jobz); + cusolverEigRange_t range_t = cublas_eig_range(range); + cublasFillMode_t uplo_t = cublas_fill_mode(uplo); + + cusolverErrcheck(cusolverDnZhegvdx_bufferSize( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + &lwork + )); + + cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork)); + + cusolverErrcheck(cusolverDnZhegvdx( + cusolver_handle, + itype_t, jobz_t, range_t, uplo_t, + n, + d_A_copy, lda, + d_B_copy, lda, + vl, vu, il, iu, + h_meig, + d_eigen_val, + d_work, lwork, + d_info + )); + + int h_info = 0; + cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if (h_info < 0) { + throw std::runtime_error("hegvdx (complex): illegal argument #" + std::to_string(-h_info)); + } else if (h_info > 0) { + if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) { + throw std::runtime_error("hegvdx (complex): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge"); + } else if (h_info > n) { + throw std::runtime_error("hegvdx (complex): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite"); + } + } + + if (jobz == 'V') { + const int m = (*h_meig); + cudaErrcheck(cudaMemcpy(reinterpret_cast(d_eigen_vec), d_A_copy, sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice)); + } + + cudaFree(d_info); + cudaFree(d_work); + cudaFree(d_A_copy); + cudaFree(d_B_copy); +} + // --- getrf static inline From 4ddd934b1941f110f0f62fa838a438b87cc7d776 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 20 Oct 2025 01:42:19 +0800 Subject: [PATCH 09/17] Fix hip hegvd --- .../module_container/ATen/kernels/rocm/lapack.hip.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu index 93f0fa481b..07572a657a 100644 --- a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu +++ b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu @@ -120,12 +120,8 @@ struct lapack_hegvd { void operator()( const int dim, const int lda, - const int& itype, - const char& jobz, - const char& uplo, T* Mat_A, T* Mat_B, - const int& dim, Real* eigen_val, T *eigen_vec) { From 75dbbb1fbd5a1875089d641ba4478206a2197d5e Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 20 Oct 2025 02:36:47 +0800 Subject: [PATCH 10/17] Fix hegvd to prevent B from being overwritten by Cholesky --- .../module_container/ATen/kernels/cuda/lapack.cu | 15 +++++++++++++-- .../module_container/ATen/kernels/lapack.cpp | 5 ++++- .../module_container/ATen/kernels/lapack.h | 3 ++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu index 34fca37898..cbd10a67a2 100644 --- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu @@ -147,11 +147,19 @@ struct lapack_hegvd { { const int itype = 1; const char jobz = 'V'; - const char uplo = 'U'; + const char uplo = 'L'; cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice)); + + // prevent B from being overwritten by Cholesky + T *d_B_backup = nullptr; + cudaErrcheck(cudaMalloc(&d_B_backup, sizeof(T) * dim * lda)); + cudaErrcheck(cudaMemcpy(d_B_backup, Mat_B, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice)); + cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, - eigen_vec, lda, Mat_B, lda, + eigen_vec, lda, + d_B_backup, lda, eigen_val); + cudaErrcheck(cudaFree(d_B_backup)); } }; @@ -172,6 +180,9 @@ struct lapack_hegvx { const char range = 'I'; const char uplo = 'U'; int meig = 0; + + // this hegvdx will protect the input A, B from being overwritten + // and write the eigenvectors into eigen_vec. cuSolverConnector::hegvdx(cusolver_handle, itype, jobz, range, uplo, n, lda, A, B, diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp index 185707ff8f..226710810d 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.cpp +++ b/source/source_base/module_container/ATen/kernels/lapack.cpp @@ -215,6 +215,9 @@ struct lapack_hegvd { // eigen_vec = Mat_A std::copy(Mat_A, Mat_A + dim*lda, eigen_vec); + Tensor aux_B(DataTypeToEnum::value, DeviceType::CpuDevice, {dim * lda}); + std::copy(Mat_B, Mat_B + dim * lda, aux_B.data()); + const int itype = 1; const char jobz = 'V'; const char uplo = 'L'; @@ -232,7 +235,7 @@ struct lapack_hegvd { iwork.zero(); // After this, eigen_vec will contain the matrix Z of eigenvectors - lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, info); + lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, aux_B.data(), lda, eigen_val, work.data(), lwork, rwork.data(), lrwork, iwork.data(), liwork, info); if (info != 0) { throw std::runtime_error("hegvd failed with info = " + std::to_string(info)); } diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index 2914ee9fe6..8ffc2631a4 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -76,7 +76,7 @@ struct lapack_heevx { * * @note * See LAPACK ZHEEVX or CHEEVX documentation for more details. - * + * This routine allocates auxiliary memory inside to prevent input matrix from being destroyed. */ void operator()( const int dim, @@ -137,6 +137,7 @@ struct lapack_hegvx { * * @note * See LAPACK ZHEGVX doc for more details. + * This routine allocates auxiliary memory inside to prevent input matrix from being destroyed. */ void operator()( const int n, From 569a79cfa0ea0030b66d7119030224d8c44c7714 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 20 Oct 2025 11:45:14 +0800 Subject: [PATCH 11/17] Switch on ATen/kernels/test --- source/source_base/module_container/ATen/kernels/lapack.h | 1 + source/source_base/module_container/CMakeLists.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index 8ffc2631a4..b570bc122b 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -105,6 +105,7 @@ struct lapack_hegvd { * @note * See LAPACK ZHEGVD or CHEGVD documentation for more details. * This function assumes that A and B have the same leading dimensions, lda. + * This function copies B to auxiliary memory to avoid being overwritten. */ void operator()( const int n, diff --git a/source/source_base/module_container/CMakeLists.txt b/source/source_base/module_container/CMakeLists.txt index 80a8dce5fc..4a2be0ef29 100644 --- a/source/source_base/module_container/CMakeLists.txt +++ b/source/source_base/module_container/CMakeLists.txt @@ -21,7 +21,7 @@ endif() add_library(container STATIC ${ATen_CPU_SRCS} ${ATen_CUDA_SRCS}) -target_link_libraries(container PUBLIC +target_link_libraries(container PUBLIC ${ATen_CPU_DEPENDENCY_LIBS} ${ATen_CUDA_DEPENDENCY_LIBS} ${ATen_ROCM_DEPENDENCY_LIBS}) if(ENABLE_COVERAGE) @@ -31,5 +31,6 @@ endif() if(BUILD_TESTING) if(ENABLE_MPI) add_subdirectory(test) + add_subdirectory(ATen/kernels/test) endif() endif() From abf8340bbf2e96fc1afa2f36a0c78b2876ed336b Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:12:06 +0800 Subject: [PATCH 12/17] Change container test TARGET name to be auto-run by CI --- .../module_container/ATen/kernels/CMakeLists.txt | 2 +- .../module_container/ATen/kernels/test/CMakeLists.txt | 4 ++-- .../module_container/ATen/ops/test/CMakeLists.txt | 6 +++--- source/source_base/module_container/CMakeLists.txt | 1 - 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/CMakeLists.txt b/source/source_base/module_container/ATen/kernels/CMakeLists.txt index 92f7a9fccc..9faecec778 100644 --- a/source/source_base/module_container/ATen/kernels/CMakeLists.txt +++ b/source/source_base/module_container/ATen/kernels/CMakeLists.txt @@ -16,4 +16,4 @@ if(BUILD_TESTING) if(ENABLE_MPI) add_subdirectory(test) endif() -endif() \ No newline at end of file +endif() diff --git a/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt b/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt index 2e07d84e6a..0ca3d97c26 100644 --- a/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt +++ b/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt @@ -1,8 +1,8 @@ AddTest( - TARGET container_kernels_uts + TARGET MODULE_BASE_container_kernels_uts LIBS parameter ${math_libs} SOURCES blas_test.cpp lapack_test.cpp memory_test.cpp linalg_test.cpp ) -target_link_libraries(container_kernels_uts container base device) \ No newline at end of file +target_link_libraries(MODULE_BASE_container_kernels_uts container base device) diff --git a/source/source_base/module_container/ATen/ops/test/CMakeLists.txt b/source/source_base/module_container/ATen/ops/test/CMakeLists.txt index 0cc2c090ca..d5103acfdd 100644 --- a/source/source_base/module_container/ATen/ops/test/CMakeLists.txt +++ b/source/source_base/module_container/ATen/ops/test/CMakeLists.txt @@ -1,7 +1,7 @@ AddTest( - TARGET container_ops_uts - LIBS parameter ${math_libs} + TARGET MODULE_BASE_container_ops_uts + LIBS parameter ${math_libs} SOURCES einsum_op_test.cpp linalg_op_test.cpp ../../kernels/lapack.cpp ) -target_link_libraries(container_ops_uts container base device) \ No newline at end of file +target_link_libraries(MODULE_BASE_container_ops_uts container base device) diff --git a/source/source_base/module_container/CMakeLists.txt b/source/source_base/module_container/CMakeLists.txt index 4a2be0ef29..14b5c57c52 100644 --- a/source/source_base/module_container/CMakeLists.txt +++ b/source/source_base/module_container/CMakeLists.txt @@ -31,6 +31,5 @@ endif() if(BUILD_TESTING) if(ENABLE_MPI) add_subdirectory(test) - add_subdirectory(ATen/kernels/test) endif() endif() From 5f05bf434e385816d64c66318cf1ef135b1e5b07 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Tue, 21 Oct 2025 01:29:49 +0800 Subject: [PATCH 13/17] Add test for heevx --- .../ATen/kernels/cuda/lapack.cu | 4 ++ .../ATen/kernels/test/lapack_test.cpp | 57 ++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu index cbd10a67a2..96b24f243a 100644 --- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu +++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu @@ -6,6 +6,9 @@ #include #include +#include + + namespace container { namespace kernels { @@ -112,6 +115,7 @@ struct lapack_heevx { Real *d_eigen_val, T *d_eigen_vec) { + assert(n <= lda); // copy d_Mat to d_eigen_vec, and results will be overwritten into d_eigen_vec // by cuSolver cudaErrcheck(cudaMemcpy(d_eigen_vec, d_Mat, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice)); diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp index 8027d89f82..3348ecda3e 100644 --- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp +++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp @@ -138,6 +138,59 @@ TYPED_TEST(LapackTest, heevd) { EXPECT_EQ(expected_C1, expected_C2); } +TYPED_TEST(LapackTest, heevx) { + using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; + using Real = typename GetTypeReal::type; + using Device = typename std::tuple_element<1, decltype(TypeParam())>::type; + + blas_gemm gemmCalculator; + blas_axpy axpyCalculator; + lapack_heevx heevxCalculator; + + const int dim = 3; + const int neig = 2; // Compute first 2 eigenvalues + + Tensor A = std::move(Tensor({static_cast(4.0), static_cast(1.0), static_cast(1.0), + static_cast(1.0), static_cast(5.0), static_cast(3.0), + static_cast(1.0), static_cast(3.0), static_cast(6.0)}).to_device()); + + Tensor E = std::move(Tensor({static_cast(0.0), static_cast(0.0)}).to_device()); + Tensor V = A; + Tensor expected_C1 = std::move(Tensor({static_cast(0.0), static_cast(0.0), static_cast(0.0), + static_cast(0.0), static_cast(0.0), static_cast(0.0)}).to_device()); + Tensor expected_C2 = expected_C1; + expected_C1.zero(); + expected_C2.zero(); + + const char trans = 'N'; + const int m = 3; + const int n = neig; + const int k = 3; + const Type alpha = static_cast(1.0); + const Type beta = static_cast(0.0); + + // Compute first neig eigenvalues and eigenvectors using heevx + heevxCalculator(dim, dim, A.data(), neig, E.data(), V.data()); + + E = E.to_device(); + const Tensor Alpha = std::move(Tensor({ + static_cast(E.data()[0]), + static_cast(E.data()[1])})); + + // Check the eigenvalues and eigenvectors + // A * x = lambda * x for the first neig eigenvectors + // get A*V + gemmCalculator(trans, trans, m, n, k, &alpha, A.data(), m, V.data(), k, &beta, expected_C1.data(), m); + // get E*V + for (int ii = 0; ii < neig; ii++) { + axpyCalculator(dim, Alpha.data() + ii, V.data() + ii * dim, 1, expected_C2.data() + ii * dim, 1); + } + // check that A*V = E*V + E = E.to_device(); + V = V.to_device(); + + EXPECT_EQ(expected_C1, expected_C2); +} TYPED_TEST(LapackTest, hegvd) { using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; @@ -189,5 +242,7 @@ TYPED_TEST(LapackTest, hegvd) { EXPECT_EQ(expected_C1, expected_C2); } -} // namespace op + + +} // namespace kernels } // namespace container From 11c19c89d44987eb530f3ac55c788d82465da797 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Tue, 21 Oct 2025 01:49:26 +0800 Subject: [PATCH 14/17] Add test for hegvx --- .../ATen/kernels/test/lapack_test.cpp | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp index 3348ecda3e..d2597024ab 100644 --- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp +++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp @@ -188,6 +188,19 @@ TYPED_TEST(LapackTest, heevx) { // check that A*V = E*V E = E.to_device(); V = V.to_device(); + std::cout << "Eigenvalues E: "; + for (int i = 0; i < neig; i++) { + std::cout << E.data()[i] << " "; + } + std::cout << std::endl; + + std::cout << "Eigenvectors V:" << std::endl; + for (int i = 0; i < dim; i++) { + for (int j = 0; j < neig; j++) { + std::cout << V.data()[i + j * dim] << " "; + } + std::cout << std::endl; + } EXPECT_EQ(expected_C1, expected_C2); } @@ -242,7 +255,69 @@ TYPED_TEST(LapackTest, hegvd) { EXPECT_EQ(expected_C1, expected_C2); } +TYPED_TEST(LapackTest, hegvx) { + using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; + using Real = typename GetTypeReal::type; + using Device = typename std::tuple_element<1, decltype(TypeParam())>::type; + blas_gemm gemmCalculator; + blas_axpy axpyCalculator; + lapack_hegvx hegvxCalculator; + + const int dim = 3; + const int neig = 2; // Compute first 2 eigenvalues + + Tensor A = std::move(Tensor({static_cast(4.0), static_cast(1.0), static_cast(1.0), + static_cast(1.0), static_cast(5.0), static_cast(3.0), + static_cast(1.0), static_cast(3.0), static_cast(6.0)}).to_device()); + + Tensor B = std::move(Tensor({static_cast(2.0), static_cast(0.0), static_cast(0.0), + static_cast(0.0), static_cast(2.0), static_cast(0.0), + static_cast(0.0), static_cast(0.0), static_cast(2.0)}).to_device()); + + Tensor E = std::move(Tensor({static_cast(0.0), static_cast(0.0)}).to_device()); + Tensor V = A; + Tensor expected_C1 = std::move(Tensor({static_cast(0.0), static_cast(0.0), static_cast(0.0), + static_cast(0.0), static_cast(0.0), static_cast(0.0)}).to_device()); + Tensor expected_C2 = expected_C1; + Tensor C_temp = expected_C1; + expected_C1.zero(); + expected_C2.zero(); + + const char trans = 'N'; + const int m = 3; + const int n = neig; + const int k = 3; + const Type alpha = static_cast(1.0); + const Type beta = static_cast(0.0); + + // Compute first neig eigenvalues and eigenvectors using hegvx + hegvxCalculator(dim, dim, A.data(), B.data(), neig, E.data(), V.data()); + + E = E.to_device(); + const Tensor Alpha = std::move(Tensor({ + static_cast(E.data()[0]), + static_cast(E.data()[1])})); + + // Check the eigenvalues and eigenvectors + // A * x = lambda * B * x for the first neig eigenvectors + // get A*V + gemmCalculator(trans, trans, m, n, k, &alpha, A.data(), m, V.data(), k, &beta, expected_C1.data(), m); + // get E * B * V + // where B is 2 * eye(3,3) + // get C_temp = B * V first + gemmCalculator(trans, trans, m, n, k, &alpha, B.data(), m, V.data(), k, &beta, C_temp.data(), m); + // then compute C2 = E * B * V + for (int ii = 0; ii < neig; ii++) { + axpyCalculator(dim, Alpha.data() + ii, C_temp.data() + ii * dim, 1, expected_C2.data() + ii * dim, 1); + } + // check that A*V = E*V + E = E.to_device(); + V = V.to_device(); + + + EXPECT_EQ(expected_C1, expected_C2); +} } // namespace kernels } // namespace container From 915cc10e489d27557bc13dc8c289dbd263c74af1 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Tue, 21 Oct 2025 02:16:07 +0800 Subject: [PATCH 15/17] Remove test output code --- .../ATen/kernels/test/lapack_test.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp index d2597024ab..a10c3ef883 100644 --- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp +++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp @@ -188,19 +188,6 @@ TYPED_TEST(LapackTest, heevx) { // check that A*V = E*V E = E.to_device(); V = V.to_device(); - std::cout << "Eigenvalues E: "; - for (int i = 0; i < neig; i++) { - std::cout << E.data()[i] << " "; - } - std::cout << std::endl; - - std::cout << "Eigenvectors V:" << std::endl; - for (int i = 0; i < dim; i++) { - for (int j = 0; j < neig; j++) { - std::cout << V.data()[i + j * dim] << " "; - } - std::cout << std::endl; - } EXPECT_EQ(expected_C1, expected_C2); } From 1ae41a12f52829a3fe2d1232400a5b83ab35788d Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Tue, 21 Oct 2025 10:40:56 +0800 Subject: [PATCH 16/17] Clean the code and add docs --- .../module_container/ATen/kernels/lapack.cpp | 5 -- .../module_container/ATen/kernels/lapack.h | 47 ++++++++++++++----- .../ATen/kernels/test/lapack_test.cpp | 22 ++++----- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp index 226710810d..0c3e72d76c 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.cpp +++ b/source/source_base/module_container/ATen/kernels/lapack.cpp @@ -3,7 +3,6 @@ #include -// #include // std::memcpy #include // std::copy #include #include @@ -208,10 +207,6 @@ struct lapack_hegvd { // first copy Mat_A to eigen_vec // then pass as argument "A" in lapack hegvd // and this block of memory will be overwritten by eigenvectors - // for (int i = 0; i < dim * lda; ++i){ - // eigen_vec[i] = Mat_A[i]; - // } - // std::memcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda); // eigen_vec = Mat_A std::copy(Mat_A, Mat_A + dim*lda, eigen_vec); diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h index b570bc122b..b3a4a40c4e 100644 --- a/source/source_base/module_container/ATen/kernels/lapack.h +++ b/source/source_base/module_container/ATen/kernels/lapack.h @@ -40,7 +40,18 @@ struct lapack_potrf { const int& lda); }; - +// ============================================================================ +// Standard Hermitian Eigenvalue Problem Solvers +// ============================================================================ +// The following structures (lapack_heevd and lapack_heevx) implement solvers +// for standard Hermitian eigenvalue problems of the form: +// A * x = lambda * x +// where: +// - A is a Hermitian matrix +// - lambda are the eigenvalues to be computed +// - x are the corresponding eigenvectors +// +// ============================================================================ template struct lapack_heevd { using Real = typename GetTypeReal::type; @@ -61,18 +72,15 @@ struct lapack_heevx { * This function solves the problem A*x = lambda*x, where A is a Hermitian matrix. * It computes a subset of eigenvalues and, optionally, the corresponding eigenvectors. * - * @param jobz 'N': Compute eigenvalues only; 'V': Compute eigenvalues and eigenvectors. - * @param range 'A': All eigenvalues; 'V': Eigenvalues in the half-open interval (vl, vu]; 'I': Eigenvalues with indices il through iu. - * @param uplo 'U': Upper triangle of A is stored; 'L': Lower triangle is stored. * @param dim The order of the matrix A. dim >= 0. - * @param Mat On entry, the Hermitian matrix A. On exit, it may be overwritten. - * @param vl Lower bound of the interval to search for eigenvalues if range == 'V'. - * @param vu Upper bound of the interval to search for eigenvalues if range == 'V'. - * @param il Index of the smallest eigenvalue to be returned if range == 'I'. - * @param iu Index of the largest eigenvalue to be returned if range == 'I'. - * @param m Output: The total number of found eigenvalues. - * @param eigen_val Array to store the computed eigenvalues in ascending order. - * @param eigen_vec If not nullptr and jobz == 'V', array to store the computed eigenvectors. + * @param lda The leading dimension of the array Mat. lda >= max(1, dim). + * @param Mat On entry, the Hermitian matrix A. On exit, A is kept. + * @param neig The number of eigenvalues to be found. 0 <= neig <= dim. + * @param eigen_val On normal exit, the first \p neig elements contain the selected + * eigenvalues in ascending order. + * @param eigen_vec If eigen_vec is not nullptr, then on exit it contains the + * orthonormal eigenvectors of the matrix A. The eigenvectors are stored in + * the columns of eigen_vec, in the same order as the eigenvalues. * * @note * See LAPACK ZHEEVX or CHEEVX documentation for more details. @@ -87,6 +95,21 @@ struct lapack_heevx { T *eigen_vec); }; + +// ============================================================================ +// Generalized Hermitian-definite Eigenvalue Problem Solvers +// ============================================================================ +// The following structures (lapack_hegvd and lapack_hegvx) implement solvers +// for generalized Hermitian-definite eigenvalue problems of the form: +// A * x = lambda * B * x +// where: +// - A is a Hermitian matrix +// - B is a Hermitian positive definite matrix +// - lambda are the eigenvalues to be computed +// - x are the corresponding eigenvectors +// +// ============================================================================ + template struct lapack_hegvd { using Real = typename GetTypeReal::type; diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp index a10c3ef883..277f6a33b8 100644 --- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp +++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp @@ -92,6 +92,9 @@ TYPED_TEST(LapackTest, Potrf) { EXPECT_EQ(A, C); } +// Test for lapack_heevd and lapack_heevx: +// Solve a standard eigenvalue problem +// and check that A*V = V*E TYPED_TEST(LapackTest, heevd) { using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; using Real = typename GetTypeReal::type; @@ -179,19 +182,19 @@ TYPED_TEST(LapackTest, heevx) { // Check the eigenvalues and eigenvectors // A * x = lambda * x for the first neig eigenvectors - // get A*V + // check that A * V = V * E + // get A * V gemmCalculator(trans, trans, m, n, k, &alpha, A.data(), m, V.data(), k, &beta, expected_C1.data(), m); - // get E*V + // get V * E for (int ii = 0; ii < neig; ii++) { axpyCalculator(dim, Alpha.data() + ii, V.data() + ii * dim, 1, expected_C2.data() + ii * dim, 1); - } - // check that A*V = E*V - E = E.to_device(); - V = V.to_device(); EXPECT_EQ(expected_C1, expected_C2); } +// Test for lapack_hegvd and lapack_hegvx +// Solve a generalized eigenvalue problem +// and check that A * v = e * B * v TYPED_TEST(LapackTest, hegvd) { using Type = typename std::tuple_element<0, decltype(TypeParam())>::type; using Real = typename GetTypeReal::type; @@ -288,7 +291,8 @@ TYPED_TEST(LapackTest, hegvx) { // Check the eigenvalues and eigenvectors // A * x = lambda * B * x for the first neig eigenvectors - // get A*V + // check that A * V = E * B * V + // get A * V gemmCalculator(trans, trans, m, n, k, &alpha, A.data(), m, V.data(), k, &beta, expected_C1.data(), m); // get E * B * V // where B is 2 * eye(3,3) @@ -298,10 +302,6 @@ TYPED_TEST(LapackTest, hegvx) { for (int ii = 0; ii < neig; ii++) { axpyCalculator(dim, Alpha.data() + ii, C_temp.data() + ii * dim, 1, expected_C2.data() + ii * dim, 1); } - // check that A*V = E*V - E = E.to_device(); - V = V.to_device(); - EXPECT_EQ(expected_C1, expected_C2); } From ac973429b3b8e13481b7e643d8430447a41dcf50 Mon Sep 17 00:00:00 2001 From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com> Date: Tue, 21 Oct 2025 10:56:12 +0800 Subject: [PATCH 17/17] Fix mismatched parentheses --- .../module_container/ATen/kernels/test/lapack_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp index 277f6a33b8..acd903bb60 100644 --- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp +++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp @@ -188,6 +188,7 @@ TYPED_TEST(LapackTest, heevx) { // get V * E for (int ii = 0; ii < neig; ii++) { axpyCalculator(dim, Alpha.data() + ii, V.data() + ii * dim, 1, expected_C2.data() + ii * dim, 1); + } EXPECT_EQ(expected_C1, expected_C2); }