From 5e7cf0deff900d79c179456a509b9436684205e0 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Fri, 17 Oct 2025 09:47:48 +0800
Subject: [PATCH 01/17] Update lapack_hegvd interface to support lda different
 than n

---
 .../ATen/kernels/cuda/lapack.cu               |  31 ++-
 .../module_container/ATen/kernels/lapack.cpp  | 118 +++++++-
 .../module_container/ATen/kernels/lapack.h    |  66 ++++-
 .../ATen/kernels/rocm/lapack.hip.cu           |  23 +-
 .../ATen/kernels/test/lapack_test.cpp         |  34 +--
 .../base/third_party/cusolver.h               | 260 ++++++++++++++++--
 source/source_hsolver/kernels/hegvd_op.cpp    |   5 +-
 7 files changed, 452 insertions(+), 85 deletions(-)
diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index fcdaa7e70a..33e1eef7f5 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -105,18 +105,37 @@ template <typename T>
 struct lapack_hegvd<T, DEVICE_GPU> {
     using Real = typename GetTypeReal<T>::type;
     void operator()(
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
+        const int dim,
+        const int lda,
         T* Mat_A,
         T* Mat_B,
-        const int& dim,
-        Real* eigen_val)
+        Real* eigen_val,
+        T *eigen_vec)
     {
-        cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val);
+        const int itype = 1;
+        const char jobz = 'V';
+        const char uplo = 'U';
+        cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice));
+        cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val);
     }
 };
 
+// template <typename T>
+// struct lapack_hegvx<T, DEVICE_GPU> {
+//     using Real = typename GetTypeReal<T>::type;
+//     void operator()(
+//         const int n,
+//         const int lda,
+//         T *A,
+//         T *B,
+//         const int m,
+//         Real *eigen_val,
+//         T *eigen_vec)
+//     {
+//         cuSolverConnector::hegvx(cusolver_handle, n, lda, A, B, m, eigen_val, eigen_vec);
+//     }
+// };
+//
 template <typename T>
 struct lapack_getrf<T, DEVICE_GPU> {
     void operator()(
diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 21cbc46f96..90a3c5a0b6 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -2,6 +2,9 @@
 
 #include <base/third_party/lapack.h>
 
+// #include <cstring> // std::memcpy
+#include <algorithm> // std::copy
+
 namespace container {
 namespace kernels {
 
@@ -36,7 +39,7 @@ struct lapack_trtri<T, DEVICE_CPU> {
         const char& diag,
         const int& dim,
         T* Mat,
-        const int& lda) 
+        const int& lda)
     {
         int info = 0;
         lapackConnector::trtri(uplo, diag, dim, Mat, lda, info);
@@ -51,8 +54,8 @@ struct lapack_potrf<T, DEVICE_CPU> {
     void operator()(
         const char& uplo,
         const int& dim,
-        T* Mat, 
-        const int& lda) 
+        T* Mat,
+        const int& lda)
     {
         int info = 0;
         lapackConnector::potrf(uplo, dim, Mat, dim, info);
@@ -85,7 +88,7 @@ struct lapack_heevd<T, DEVICE_CPU> {
         Tensor iwork(DataTypeToEnum<int>::value, DeviceType::CpuDevice, {liwork});
         iwork.zero();
 
-        lapackConnector::heevd(jobz, uplo, dim, Mat, dim, eigen_val,  work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
+        lapackConnector::heevd(jobz, uplo, dim, Mat, dim, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
         if (info != 0) {
             throw std::runtime_error("heevd failed with info = " + std::to_string(info));
         }
@@ -96,14 +99,26 @@ template <typename T>
 struct lapack_hegvd<T, DEVICE_CPU> {
     using Real = typename GetTypeReal<T>::type;
     void operator()(
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
-        T* Mat_A,
-        T* Mat_B,
-        const int& dim,
-        Real* eigen_val)
+        const int dim,
+        const int lda,
+        T *Mat_A,
+        T *Mat_B,
+        Real *eigen_val,
+        T *eigen_vec)
     {
+        // first copy Mat_A to eigen_vec
+        // then pass as argument "A" in lapack hegvd
+        // and this block of memory will be overwritten by eigenvectors
+        // for (int i = 0; i < dim * lda; ++i){
+        //     eigen_vec[i] = Mat_A[i];
+        // }
+        // std::memcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda);
+        // eigen_vec = Mat_A
+        std::copy(Mat_A, Mat_A + dim*lda, eigen_vec);
+
+        const int itype = 1;
+        const char jobz = 'V';
+        const char uplo = 'U';
         int info = 0;
         int lwork = std::max(2 * dim + dim * dim, 1 + 6 * dim + 2 * dim * dim);
         Tensor work(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {lwork});
@@ -117,13 +132,90 @@ struct lapack_hegvd<T, DEVICE_CPU> {
         Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {liwork});
         iwork.zero();
 
-        lapackConnector::hegvd(itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
+        // After this, eigen_vec will contain the matrix Z of eigenvectors
+        lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
         if (info != 0) {
             throw std::runtime_error("hegvd failed with info = " + std::to_string(info));
         }
     }
 };
 
+
+// template <typename T>
+// struct lapack_hegvx<T, DEVICE_CPU> {
+//     using Real = typename GetTypeReal<T>::type;
+//     void operator()(
+//         const int n,
+//         const int lda,
+//         T *A,
+//         T *B,
+//         const int m,
+//         Real *eigen_val,
+//         T *eigen_vec)
+//     {
+//         int info = 0;
+
+//         int mm = m;
+
+//         int lwork = -1;
+
+//         T *work = new T[1];
+//         Real *rwork = new Real[7 * n];
+//         int *iwork = new int[5 * n];
+//         int *ifail = new int[n];
+
+//         // set lwork = -1 to query optimal work size
+//         lapackConnector::hegvx(1, // ITYPE = 1:  A*x = (lambda)*B*x
+//             'V', 'I', 'U',
+//             n, A, lda, B, lda,
+//             0.0, 0.0,
+//             1, m, 0.0, mm,
+//             eigen_val, eigen_vec, lda,
+//             work,
+//             lwork,      // lwork = 1, query optimal size.
+//             rwork, iwork, ifail,
+//             info);
+
+//         // !>  If LWORK = -1, then a workspace query is assumed; the routine
+//         // !>  only calculates the optimal size of the WORK array, returns
+//         // !>  this value as the first entry of the WORK array.
+//         lwork = int(get_real(work[0]));
+//         delete[] work;
+//         work = new T[lwork];
+
+//         lapackConnector::hegvx(
+//             1, // ITYPE = 1:  A*x = (lambda)*B*x
+//             'V',    // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
+//             'I',    // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
+//             'U',    // UPLO = 'U':  Upper triangles of A and B are stored.
+//             n,      // order of the matrices A and B.
+//             A,      // A is COMPLEX*16 array  dimension (LDA, N)
+//             lda,    // leading dimension of the array A.
+//             B,      // B is COMPLEX*16 array, dimension (LDB, N)
+//             lda,    // assume that leading dimension of B is the same as A.
+//             0.0,    // VL, Not referenced if RANGE = 'A' or 'I'.
+//             0.0,    // VU, Not referenced if RANGE = 'A' or 'I'.
+//             1,      // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
+//             m,      // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
+//             0.0,    // ABSTOL
+//             mm,     // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
+//             eigen_val,  // W store eigenvalues
+//             eigen_vec,  // Z store eigenvector
+//             lda,        // LDZ: The leading dimension of the array Z.
+//             work,
+//             lwork,
+//             rwork,
+//             iwork,
+//             ifail,
+//             info);
+
+//         delete[] work;
+//         delete[] rwork;
+//         delete[] iwork;
+//         delete[] ifail;
+//     }
+// };
+
 template <typename T>
 struct lapack_getrf<T, DEVICE_CPU> {
     void operator()(
@@ -220,4 +312,4 @@ template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
 template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;
 
 } // namespace kernels
-} // namespace container
\ No newline at end of file
+} // namespace container
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index dd3243ac60..5f201769ce 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -35,7 +35,7 @@ struct lapack_potrf {
     void operator()(
         const char& uplo,
         const int& dim,
-        T* Mat, 
+        T* Mat,
         const int& lda);
 };
 
@@ -55,16 +55,64 @@ struct lapack_heevd {
 template <typename T, typename Device>
 struct lapack_hegvd {
     using Real = typename GetTypeReal<T>::type;
+    /**
+     * @brief Computes all the eigenvalues and, optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem.
+     *
+     * This function solves the problem A*x = lambda*B*x, where A and B are Hermitian matrices, and B is also positive definite.
+     *
+     * @param dim The order of the matrices Mat_A and Mat_B. dim >= 0.
+     * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, dim).
+     * @param Mat_A On entry, the Hermitian matrix A. On exit, it may be overwritten.
+     * @param Mat_B On entry, the Hermitian positive definite matrix B. On exit, it may be overwritten.
+     * @param eigen_val Array to store the computed eigenvalues in ascending order.
+     * @param eigen_vec If not nullptr, array to store the computed eigenvectors.
+     *
+     * @note
+     * See LAPACK ZHEGVD or CHEGVD documentation for more details.
+     * This function assumes that A and B have the same leading dimensions, lda.
+     */
     void operator()(
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
-        T* Mat_A,
-        T* Mat_B,
-        const int& dim,
-        Real* eigen_val);
+        const int dim,
+        const int lda,
+        T *Mat_A,
+        T *Mat_B,
+        Real *eigen_val,
+        T *eigen_vec);
 };
 
+// template <typename T, typename Device>
+// struct lapack_hegvx {
+//     using Real = typename GetTypeReal<T>::type;
+//     /**
+//      * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of
+//      * a complex generalized Hermitian-definite eigenproblem.
+//      *
+//      * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version
+//      * is implemented through the `evd` interface and acquires the first m eigenpairs
+//      *
+//      * hegvx 'V' 'I' 'U'  is used to compute the first m eigenpairs of the problem
+//      *
+//      * @param n The order of the matrices A and B. n >= 0.
+//      * @param lda The leading dimension of the array A and B. lda >= max(1, n).
+//      * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors.
+//      * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B.
+//      * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n.
+//      * @param eigen_val The first m eigenvalues in ascending order.
+//      * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues.
+//      *
+//      * @note
+//      * See LAPACK ZHEGVX doc for more details.
+//      */
+//     void operator()(
+//         const int n,
+//         const int lda,
+//         T *A,
+//         T *B,
+//         const int m,
+//         Real *eigen_val,
+//         T *eigen_vec);
+// };
+
 
 template <typename T, typename Device>
 struct lapack_getrf {
@@ -110,4 +158,4 @@ void destroyGpuSolverHandle(); // destroy cusolver handle
 } // namespace container
 } // namespace kernels
 
-#endif // ATEN_KERNELS_LAPACK_H_
\ No newline at end of file
+#endif // ATEN_KERNELS_LAPACK_H_
diff --git a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
index 64040cb79c..93f0fa481b 100644
--- a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
+++ b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
@@ -28,8 +28,8 @@ void destroyGpuSolverHandle() {
 template <typename T>
 __global__ void set_matrix_kernel(
     const char uplo,
-    T* A, 
-    const int dim) 
+    T* A,
+    const int dim)
 {
     int bid = blockIdx.x;
     int tid = threadIdx.x;
@@ -64,7 +64,7 @@ struct lapack_trtri<T, DEVICE_GPU> {
         const char& diag,
         const int& dim,
         T* Mat,
-        const int& lda) 
+        const int& lda)
     {
         // TODO: trtri is not implemented in this method yet
         // Cause the trtri in cuSolver is not stable for ABACUS!
@@ -82,8 +82,8 @@ struct lapack_potrf<T, DEVICE_GPU> {
     void operator()(
         const char& uplo,
         const int& dim,
-        T* Mat, 
-        const int& lda) 
+        T* Mat,
+        const int& lda)
     {
         // hipSolverConnector::potrf(hipsolver_handle, uplo, dim, Mat, dim);
         std::vector<T> H_Mat(dim * dim, static_cast<T>(0.0));
@@ -118,15 +118,22 @@ template <typename T>
 struct lapack_hegvd<T, DEVICE_GPU> {
     using Real = typename GetTypeReal<T>::type;
     void operator()(
+        const int dim,
+        const int lda,
         const int& itype,
         const char& jobz,
         const char& uplo,
         T* Mat_A,
         T* Mat_B,
         const int& dim,
-        Real* eigen_val)
+        Real* eigen_val,
+        T *eigen_vec)
     {
-        hipSolverConnector::hegvd(hipsolver_handle, itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val);
+        const int itype = 1;
+        const char jobz = 'V';
+        const char uplo = 'U';
+        hipErrcheck(hipMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, hipMemcpyDeviceToDevice));
+        hipSolverConnector::hegvd(hipsolver_handle, itype, jobz, uplo, dim, Mat_A, lda, Mat_B, lda, eigen_val);
     }
 };
 
@@ -156,4 +163,4 @@ template struct lapack_hegvd<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<double>, DEVICE_GPU>;
 
 } // namespace kernels
-} // namespace container
\ No newline at end of file
+} // namespace container
diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
index 9675e979c5..8027d89f82 100644
--- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -33,14 +33,14 @@ TYPED_TEST(LapackTest, Trtri) {
     Tensor A = std::move(Tensor({static_cast<Type>(1.0), static_cast<Type>(2.0), static_cast<Type>(3.0),
                                  static_cast<Type>(0.0), static_cast<Type>(4.0), static_cast<Type>(5.0),
                                  static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(6.0)}).to_device<Device>());
-    
+
     Tensor I = std::move(Tensor({static_cast<Type>(1.0), static_cast<Type>(0.0), static_cast<Type>(0.0),
                                  static_cast<Type>(0.0), static_cast<Type>(1.0), static_cast<Type>(0.0),
                                  static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(1.0)}).to_device<Device>());
     Tensor B = A;
     Tensor C = B;
     C.zero();
-    
+
     const char trans = 'N';
     const int m = 3;
     const int n = 3;
@@ -51,7 +51,7 @@ TYPED_TEST(LapackTest, Trtri) {
     // For this reason, we should employ 'L' instead of 'U' in the subsequent line.
     trtriCalculator('L', 'N', dim, B.data<Type>(), dim);
     gemmCalculator(trans, trans, m, n, k, &alpha, B.data<Type>(), k, A.data<Type>(), n, &beta, C.data<Type>(), n);
-    
+
     EXPECT_EQ(C, I);
 }
 
@@ -69,11 +69,11 @@ TYPED_TEST(LapackTest, Potrf) {
     Tensor A = std::move(Tensor({static_cast<Type>(4.0), static_cast<Type>(1.0), static_cast<Type>(2.0),
                                  static_cast<Type>(1.0), static_cast<Type>(5.0), static_cast<Type>(3.0),
                                  static_cast<Type>(2.0), static_cast<Type>(3.0), static_cast<Type>(6.0)}).to_device<Device>());
-    
+
     Tensor B = A;
     Tensor C = B;
     C.zero();
-    
+
     const char transa = 'N';
     const char transb = 'C';
     const int m = 3;
@@ -96,7 +96,7 @@ TYPED_TEST(LapackTest, heevd) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
     using Real = typename GetTypeReal<Type>::type;
     using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
-    
+
     blas_gemm<Type, Device> gemmCalculator;
     blas_axpy<Type, Device> axpyCalculator;
     lapack_heevd<Type, Device> heevdCalculator;
@@ -105,14 +105,14 @@ TYPED_TEST(LapackTest, heevd) {
     Tensor A = std::move(Tensor({static_cast<Type>(4.0), static_cast<Type>(1.0), static_cast<Type>(1.0),
                                  static_cast<Type>(1.0), static_cast<Type>(5.0), static_cast<Type>(3.0),
                                  static_cast<Type>(1.0), static_cast<Type>(3.0), static_cast<Type>(6.0)}).to_device<Device>());
-    
+
     Tensor E = std::move(Tensor({static_cast<Real>(0.0), static_cast<Real>(0.0), static_cast<Real>(0.0)}).to_device<Device>());
     Tensor B = A;
     Tensor expected_C1 = A;
     Tensor expected_C2 = A;
     expected_C1.zero();
     expected_C2.zero();
-    
+
     const char trans = 'N';
     const int m = 3;
     const int n = 3;
@@ -122,10 +122,10 @@ TYPED_TEST(LapackTest, heevd) {
     // Note all blas and lapack operators within container are column major!
     // For this reason, we should employ 'L' instead of 'U' in the subsequent line.
     heevdCalculator('V', 'U', B.data<Type>(), dim, E.data<Real>());
-    
+
     E = E.to_device<DEVICE_CPU>();
     const Tensor Alpha = std::move(Tensor({
-            static_cast<Type>(E.data<Real>()[0]), 
+            static_cast<Type>(E.data<Real>()[0]),
             static_cast<Type>(E.data<Real>()[1]),
             static_cast<Type>(E.data<Real>()[2])}));
 
@@ -143,7 +143,7 @@ TYPED_TEST(LapackTest, hegvd) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
     using Real = typename GetTypeReal<Type>::type;
     using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
-    
+
     blas_gemm<Type, Device> gemmCalculator;
     blas_axpy<Type, Device> axpyCalculator;
     lapack_hegvd<Type, Device> hegvdCalculator;
@@ -152,31 +152,31 @@ TYPED_TEST(LapackTest, hegvd) {
     Tensor A = std::move(Tensor({static_cast<Type>(4.0), static_cast<Type>(1.0), static_cast<Type>(1.0),
                                  static_cast<Type>(1.0), static_cast<Type>(5.0), static_cast<Type>(3.0),
                                  static_cast<Type>(1.0), static_cast<Type>(3.0), static_cast<Type>(6.0)}).to_device<Device>());
-    
+
     Tensor I = std::move(Tensor({static_cast<Type>(1.0), static_cast<Type>(0.0), static_cast<Type>(0.0),
                                  static_cast<Type>(0.0), static_cast<Type>(1.0), static_cast<Type>(0.0),
                                  static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(1.0)}).to_device<Device>());
-    
+
     Tensor E = std::move(Tensor({static_cast<Real>(0.0), static_cast<Real>(0.0), static_cast<Real>(0.0)}).to_device<Device>());
     Tensor B = A;
     Tensor expected_C1 = A;
     Tensor expected_C2 = A;
     expected_C1.zero();
     expected_C2.zero();
-    
+
     const char trans = 'N';
     const int m = 3;
     const int n = 3;
     const int k = 3;
     const Type alpha = static_cast<Type>(1.0);
     const Type beta  = static_cast<Type>(0.0);
-    // Note al<l blas and lapack operators within container are column major!
+    // Note all blas and lapack operators within container are column major!
     // For this reason, we should employ 'L' instead of 'U' in the subsequent line.
-    hegvdCalculator(1, 'V', 'U', B.data<Type>(), I.data<Type>(), dim, E.data<Real>());
+    hegvdCalculator(dim, dim, A.data<Type>(), I.data<Type>(), E.data<Real>(), B.data<Type>());
 
     E = E.to_device<DEVICE_CPU>();
     const Tensor Alpha = std::move(Tensor({
-            static_cast<Type>(E.data<Real>()[0]), 
+            static_cast<Type>(E.data<Real>()[0]),
             static_cast<Type>(E.data<Real>()[1]),
             static_cast<Type>(E.data<Real>()[2])}));
 
diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h
index d65aa5a204..6bd137d718 100644
--- a/source/source_base/module_container/base/third_party/cusolver.h
+++ b/source/source_base/module_container/base/third_party/cusolver.h
@@ -145,14 +145,14 @@ static inline
 void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float * W)
 {
     // prepare some values for cusolverDnSsyevd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*   d_info = nullptr;
     float* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnSsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, A, lda, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
@@ -171,14 +171,14 @@ static inline
 void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double * W)
 {
     // prepare some values for cusolverDnDsyevd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*    d_info = nullptr;
     double* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnDsyevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, A, lda, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
@@ -197,14 +197,14 @@ static inline
 void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, float * W)
 {
     // prepare some values for cusolverDnCheevd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*    d_info = nullptr;
     cuComplex* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnCheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, reinterpret_cast<cuComplex*>(A), lda, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
@@ -223,14 +223,14 @@ static inline
 void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, double* W)
 {
     // prepare some values for cusolverDnZheevd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*    d_info = nullptr;
     cuDoubleComplex* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnZheevd_bufferSize(cusolver_handle, cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, reinterpret_cast<cuDoubleComplex*>(A), lda, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
@@ -250,19 +250,19 @@ static inline
 void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W)
 {
     // prepare some values for cusolverDnSsygvd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*   d_info = nullptr;
     float* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnSsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, A, lda, B, ldb, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
     // compute eigenvalues and eigenvectors.
-    cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnSsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, A, lda, B, ldb, W, d_work, lwork, d_info));
 
     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
@@ -276,19 +276,19 @@ static inline
 void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, double* A, const int& lda, double* B, const int& ldb, double * W)
 {
     // prepare some values for cusolverDnDsygvd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*   d_info = nullptr;
     double* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnDsygvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, A, lda, B, ldb, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
     // compute eigenvalues and eigenvectors.
-    cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnDsygvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, A, lda, B, ldb, W, d_work, lwork, d_info));
 
     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
@@ -302,19 +302,19 @@ static inline
 void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<float>* A, const int& lda, std::complex<float>* B, const int& ldb, float* W)
 {
     // prepare some values for cusolverDnChegvd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*   d_info = nullptr;
     cuComplex* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnChegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
     // compute eigenvalues and eigenvectors.
-    cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnChegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb, W, d_work, lwork, d_info));
 
     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
@@ -328,19 +328,19 @@ static inline
 void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, std::complex<double>* A, const int& lda, std::complex<double>* B, const int& ldb, double* W)
 {
     // prepare some values for cusolverDnZhegvd_bufferSize
-    int lwork  = 0; 
-    int h_info = 0; 
+    int lwork  = 0;
+    int h_info = 0;
     int*   d_info = nullptr;
     cuDoubleComplex* d_work = nullptr;
     cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
 
     // calculate the sizes needed for pre-allocated buffer.
-    cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnZhegvd_bufferSize(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, &lwork));
     // allocate memory
     cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
     // compute eigenvalues and eigenvectors.
-    cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo), 
+    cusolverErrcheck(cusolverDnZhegvd(cusolver_handle, cublas_eig_type(itype), cublas_eig_mode(jobz), cublas_fill_mode(uplo),
                                 n, reinterpret_cast<cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb, W, d_work, lwork, d_info));
 
     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
@@ -351,6 +351,208 @@ void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     cudaErrcheck(cudaFree(d_work));
 }
 
+// -----------------------------
+// CUDA hegvx implementations
+// -----------------------------
+
+// static inline
+// void hegvx(cusolverDnHandle_t& cusolver_handle,
+//            const int& itype,
+//            const char& jobz,
+//            const char& uplo,
+//            const int& n,
+//            float* A,
+//            const int& lda,
+//            float* B,
+//            const int& ldb,
+//            const int& il,
+//            const int& iu,
+//            const float& abstol,
+//            int& m_out,
+//            float* W,
+//            float* Z,
+//            const int& ldz) {
+//     // Step 1: Query workspace size
+//     int lwork = 0;
+//     cusolverErrcheck(cusolverDnSsygvdx_bufferSize(
+//         cusolver_handle,
+//         cublas_eig_type(itype),       // itype
+//         cublas_eig_mode(jobz),        // jobz
+//         cublas_fill_mode(uplo),       // uplo
+//         n, A, lda, B, ldb,
+//         il, iu,                       // range: index interval
+//         abstol,                       // tolerance
+//         &m_out,                       // output: number of eigenvalues found
+//         W,                            // eigenvalues
+//         Z, ldz,                       // eigenvectors
+//         &lwork));                      // workspace size
+
+//     // Step 2: Allocate workspace and info
+//     float* d_work = nullptr;
+//     int*   d_info  = nullptr;
+//     cudaErrcheck(cudaMalloc(&d_work, sizeof(float) * lwork));
+//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
+
+//     // Step 3: Call the solver
+//     cusolverErrcheck(cusolverDnSsygvdx(
+//         cusolver_handle,
+//         cublas_eig_type(itype),
+//         cublas_eig_mode(jobz),
+//         cublas_fill_mode(uplo),
+//         n, A, lda, B, ldb,
+//         il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info));
+
+//     // Step 4: Check result
+//     int h_info = 0;
+//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+//     if (h_info != 0) {
+//         throw std::runtime_error("hegvx: cusolverDnSsygvdx failed with info = " + std::to_string(h_info));
+//     }
+
+//     // Cleanup
+//     cudaErrcheck(cudaFree(d_work));
+//     cudaErrcheck(cudaFree(d_info));
+// }
+
+// Double precision
+// static inline
+// void hegvx(cusolverDnHandle_t& cusolver_handle,
+//            const int& itype,
+//            const char& jobz,
+//            const char& uplo,
+//            const int& n,
+//            double* A,
+//            const int& lda,
+//            double* B,
+//            const int& ldb,
+//            const int& il,
+//            const int& iu,
+//            const double& abstol,
+//            int& m_out,
+//            double* W,
+//            double* Z,
+//            const int& ldz) {
+//     int lwork = 0;
+//     cusolverErrcheck(cusolverDnDsygvdx_bufferSize(
+//         cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb,
+//         il, iu, abstol, &m_out, W, Z, ldz, &lwork));
+
+//     double* d_work = nullptr;
+//     int*    d_info  = nullptr;
+//     cudaErrcheck(cudaMalloc(&d_work, sizeof(double) * lwork));
+//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
+
+//     cusolverErrcheck(cusolverDnDsygvdx(
+//         cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb,
+//         il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info));
+
+//     int h_info = 0;
+//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+//     if (h_info != 0) {
+//         throw std::runtime_error("hegvx: cusolverDnDsygvdx failed with info = " + std::to_string(h_info));
+//     }
+
+//     cudaErrcheck(cudaFree(d_work));
+//     cudaErrcheck(cudaFree(d_info));
+// }
+
+// // Complex single precision
+// static inline
+// void hegvx(cusolverDnHandle_t& cusolver_handle,
+//            const int& itype,
+//            const char& jobz,
+//            const char& uplo,
+//            const int& n,
+//            cuComplex* A,
+//            const int& lda,
+//            cuComplex* B,
+//            const int& ldb,
+//            const int& il,
+//            const int& iu,
+//            const float& abstol,
+//            int& m_out,
+//            float* W,
+//            cuComplex* Z,
+//            const int& ldz) {
+//     int lwork = 0;
+//     cusolverErrcheck(cusolverDnChegvdx_bufferSize(
+//         cusolver_handle, itype, jobz, uplo, n,
+//         reinterpret_cast<cuFloatComplex*>(A), lda,
+//         reinterpret_cast<cuFloatComplex*>(B), ldb,
+//         il, iu, abstol, &m_out, W,
+//         reinterpret_cast<cuFloatComplex*>(Z), ldz, &lwork));
+
+//     cuComplex* d_work = nullptr;
+//     int*       d_info  = nullptr;
+//     cudaErrcheck(cudaMalloc(&d_work, sizeof(cuComplex) * lwork));
+//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
+
+//     cusolverErrcheck(cusolverDnChegvdx(
+//         cusolver_handle, itype, jobz, uplo, n,
+//         reinterpret_cast<cuFloatComplex*>(A), lda,
+//         reinterpret_cast<cuFloatComplex*>(B), ldb,
+//         il, iu, abstol, &m_out, W,
+//         reinterpret_cast<cuFloatComplex*>(Z), ldz, d_work, lwork, d_info));
+
+//     int h_info = 0;
+//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+//     if (h_info != 0) {
+//         throw std::runtime_error("hegvx: cusolverDnChegvdx failed with info = " + std::to_string(h_info));
+//     }
+
+//     cudaErrcheck(cudaFree(d_work));
+//     cudaErrcheck(cudaFree(d_info));
+// }
+
+// // Complex double precision
+// static inline
+// void hegvx(cusolverDnHandle_t& cusolver_handle,
+//            const int& itype,
+//            const char& jobz,
+//            const char& uplo,
+//            const int& n,
+//            cuDoubleComplex* A,
+//            const int& lda,
+//            cuDoubleComplex* B,
+//            const int& ldb,
+//            const int& il,
+//            const int& iu,
+//            const double& abstol,
+//            int& m_out,
+//            double* W,
+//            cuDoubleComplex* Z,
+//            const int& ldz) {
+//     int lwork = 0;
+//     cusolverErrcheck(cusolverDnZhegvdx_bufferSize(
+//         cusolver_handle, itype, jobz, uplo, n,
+//         reinterpret_cast<cuDoubleComplex*>(A), lda,
+//         reinterpret_cast<cuDoubleComplex*>(B), ldb,
+//         il, iu, abstol, &m_out, W,
+//         reinterpret_cast<cuDoubleComplex*>(Z), ldz, &lwork));
+
+//     cuDoubleComplex* d_work = nullptr;
+//     int*             d_info  = nullptr;
+//     cudaErrcheck(cudaMalloc(&d_work, sizeof(cuDoubleComplex) * lwork));
+//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
+
+//     cusolverErrcheck(cusolverDnZhegvdx(
+//         cusolver_handle, itype, jobz, uplo, n,
+//         reinterpret_cast<cuDoubleComplex*>(A), lda,
+//         reinterpret_cast<cuDoubleComplex*>(B), ldb,
+//         il, iu, abstol, &m_out, W,
+//         reinterpret_cast<cuDoubleComplex*>(Z), ldz, d_work, lwork, d_info));
+
+//     int h_info = 0;
+//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+//     if (h_info != 0) {
+//         throw std::runtime_error("hegvx: cusolverDnZhegvdx failed with info = " + std::to_string(h_info));
+//     }
+
+//     cudaErrcheck(cudaFree(d_work));
+//     cudaErrcheck(cudaFree(d_info));
+// }
+
+// --- getrf
 static inline
 void getrf(cusolverDnHandle_t& cusolver_handle, const int& m, const int& n, float* A, const int& lda, int* ipiv)
 {
@@ -528,4 +730,4 @@ void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n,
 } // namespace cuSolverConnector
 } // namespace container
 
-#endif // BASE_THIRD_PARTY_CUSOLVER_H_
\ No newline at end of file
+#endif // BASE_THIRD_PARTY_CUSOLVER_H_
diff --git a/source/source_hsolver/kernels/hegvd_op.cpp b/source/source_hsolver/kernels/hegvd_op.cpp
index ab7c520c3d..0959338005 100644
--- a/source/source_hsolver/kernels/hegvd_op.cpp
+++ b/source/source_hsolver/kernels/hegvd_op.cpp
@@ -1,7 +1,6 @@
 #include "source_hsolver/kernels/hegvd_op.h"
 #include "source_base/module_container/base/third_party/lapack.h"
 
-#include <algorithm>
 #include <fstream>
 #include <iostream>
 
@@ -146,7 +145,7 @@ struct hegvd_op<T, base_device::DEVICE_CPU>
 /**
  * @brief heevx computes the first m eigenvalues and their corresponding eigenvectors of
  * a complex generalized Hermitian-definite eigenproblem.
- * 
+ *
  * both heevx and syevx are implemented through the `evx` interface of LAPACK.
  * wrapped in LapackWrapper::xheevx
  */
@@ -349,4 +348,4 @@ template struct heevx_op<double, base_device::DEVICE_CPU>;
 template struct hegvx_op<double, base_device::DEVICE_CPU>;
 // template struct hegv_op<double, base_device::DEVICE_CPU>;
 #endif
-} // namespace hsolver
\ No newline at end of file
+} // namespace hsolver

From 39bbfe4fbeaea6dc39ee84b4a68ca6ff04fa0e03 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Fri, 17 Oct 2025 14:18:19 +0800
Subject: [PATCH 02/17] Replace hegvd_op with lapack_hegvd in
 diago_dav_subspace

---
 source/source_hsolver/diago_dav_subspace.cpp |  9 ++--
 source/source_hsolver/diago_dav_subspace.h   |  8 +++-
 source/source_hsolver/diago_david.h          | 43 +++++++++++---------
 3 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index 111ade3e43..5a525bd78d 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -4,12 +4,15 @@
 
 #include "source_base/module_device/device.h"
 #include "source_base/timer.h"
-#include "source_hsolver/kernels/hegvd_op.h"
 #include "source_base/kernels/math_kernel_op.h"
-#include "source_hsolver/kernels/bpcg_kernel_op.h" // normalize_op, precondition_op, apply_eigenvalues_op
 #include "source_base/kernels/dsp/dsp_connector.h"
+// #include "source_base/module_container/ATen/kernels/lapack.h"
+
+#include <ATen/kernels/lapack.h>
 
+#include "source_hsolver/kernels/hegvd_op.h"
 #include "source_hsolver/diag_hs_para.h"
+#include "source_hsolver/kernels/bpcg_kernel_op.h" // normalize_op, precondition_op, apply_eigenvalues_op
 
 #include <vector>
 
@@ -540,7 +543,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         if (this->diag_comm.rank == 0)
         {
             syncmem_complex_op()(this->d_scc, scc, nbase * this->nbase_x);
-            hegvd_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc);
+            ct::kernels::lapack_hegvd<T, ct_Device>()(nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc);
             syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x);
         }
 #endif
diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h
index 53bcb3e57b..6ad3300b18 100644
--- a/source/source_hsolver/diago_dav_subspace.h
+++ b/source/source_hsolver/diago_dav_subspace.h
@@ -5,6 +5,8 @@
 #include "source_base/module_device/device.h"   // base_device
 #include "source_base/module_device/memory_op.h"// base_device::memory"
 
+#include "source_base/module_container/ATen/kernels/lapack.h"
+
 #include "source_hsolver/diag_comm_info.h"
 #include "source_hsolver/diag_const_nums.h"
 
@@ -189,10 +191,14 @@ class Diago_DavSubspace
     using syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
     using syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;
 
+    // Note that ct_Device is different from base_device!
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
+    // using hegvd_op = container::kernels::lapack_hegvd<T, ct_Device>;
+
     const T *one = nullptr, *zero = nullptr, *neg_one = nullptr;
     const T one_ = static_cast<T>(1.0), zero_ = static_cast<T>(0.0), neg_one_ = static_cast<T>(-1.0);
 };
 
 } // namespace hsolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/source_hsolver/diago_david.h b/source/source_hsolver/diago_david.h
index 5b65f22cec..02d2f46697 100644
--- a/source/source_hsolver/diago_david.h
+++ b/source/source_hsolver/diago_david.h
@@ -5,7 +5,10 @@
 #include "source_base/module_device/device.h"   // base_device
 #include "source_base/module_device/memory_op.h"// base_device::memory
 
+// #include "source_base/module_container/ATen/kernels/lapack.h" // container::kernels
+
 #include "source_hsolver/diag_comm_info.h"
+#include "source_hsolver/kernels/hegvd_op.h"
 
 #include <vector>
 #include <functional>
@@ -26,16 +29,16 @@ template <typename T = std::complex<double>, typename Device = base_device::DEVI
 class DiagoDavid
 {
   private:
-    // Note GetTypeReal<T>::type will 
-    // return T if T is real type(float, double), 
+    // Note GetTypeReal<T>::type will
+    // return T if T is real type(float, double),
     // otherwise return the real type of T(complex<float>, std::complex<double>)
     using Real = typename GetTypeReal<T>::type;
-  
+
   public:
 
     /**
      * @brief Constructor for the DiagoDavid class.
-     * 
+     *
      * @param[in] precondition_in Pointer to the preconditioning matrix.
      * @param[in] nband_in Number of eigenpairs required(i.e. bands).
      * @param[in] dim_in Dimension of the matrix.
@@ -44,10 +47,10 @@ class DiagoDavid
      *                      the reduced basis set before \b restart of Davidson.
      * @param[in] use_paw_in Flag indicating whether to use PAW.
      * @param[in] diag_comm_in Communication information for diagonalization.
-     * 
+     *
      * @tparam T The data type of the matrices and arrays.
      * @tparam Device The device type (base_device::DEVICE_CPU or DEVICE_GPU).
-     * 
+     *
      * @note Auxiliary memory is allocated in the constructor and deallocated in the destructor.
      */
     DiagoDavid(const Real* precondition_in,
@@ -59,10 +62,10 @@ class DiagoDavid
 
     /**
      * @brief Destructor for the DiagoDavid class.
-     * 
+     *
      * This destructor releases the dynamically allocated memory used by the class members.
      * It deletes the basis, hpsi, spsi, hcc, vcc, lagrange_matrix, and eigenvalue arrays.
-     * 
+     *
      */
     ~DiagoDavid();
 
@@ -75,7 +78,7 @@ class DiagoDavid
      * This function type is used to define a matrix-blockvector operator H.
      * For eigenvalue problem HX = λX or generalized eigenvalue problem HX = λSX,
      * this function computes the product of the Hamiltonian matrix H and a blockvector X.
-     * 
+     *
      * Called as follows:
      * hpsi(X, HX, ld, nvec) where X and HX are (ld, nvec)-shaped blockvectors.
      * Result HX = H * X is stored in HX.
@@ -84,7 +87,7 @@ class DiagoDavid
      * @param[in]  HX     Head address of output blockvector of type `T*`.
      * @param[in]  ld     Leading dimension of blockvector.
      * @param[in]  nvec   Number of vectors in a block.
-     * 
+     *
      * @warning X and HX are the exact address to read input X and store output H*X,
      * @warning both of size ld * nvec.
      */
@@ -92,7 +95,7 @@ class DiagoDavid
 
     /**
      * @brief A function type representing the SX function.
-     * 
+     *
      * nrow is leading dimension of spsi, npw is leading dimension of psi, nbands is number of vecs
      *
      * This function type is used to define a matrix-blockvector operator S.
@@ -108,9 +111,9 @@ class DiagoDavid
 
     /**
      * @brief Performs iterative diagonalization using the David algorithm.
-     * 
+     *
      * @warning Please see docs of `HPsiFunc` for more information about the hpsi mat-vec interface.
-     * 
+     *
      * @tparam T The type of the elements in the matrix.
      * @tparam Device The device type (CPU or GPU).
      * @param hpsi_func The function object that computes the matrix-blockvector product H * psi.
@@ -123,13 +126,13 @@ class DiagoDavid
      * @param ntry_max The maximum number of attempts for the diagonalization restart.
      * @param notconv_max The maximum number of bands unconverged allowed.
      * @return The total number of iterations performed during the diagonalization.
-     * 
+     *
      * @note ntry_max is an empirical parameter that should be specified in external routine, default 5
      *       notconv_max is determined by the accuracy required for the calculation, default 0
      */
     int diag(
-      const HPsiFunc& hpsi_func,  // function void hpsi(T*, T*, const int, const int) 
-      const SPsiFunc& spsi_func,  // function void spsi(T*, T*, const int, const int, const int) 
+      const HPsiFunc& hpsi_func,  // function void hpsi(T*, T*, const int, const int)
+      const SPsiFunc& spsi_func,  // function void spsi(T*, T*, const int, const int, const int)
       const int ld_psi,           // Leading dimension of the psi input
       T *psi_in,                  // Pointer to eigenvectors
       Real* eigenvalue_in,        // Pointer to store the resulting eigenvalues
@@ -218,7 +221,7 @@ class DiagoDavid
 
     /**
      * Calculates the elements of the diagonalization matrix for the DiagoDavid class.
-     * 
+     *
      * @param dim The dimension of the problem.
      * @param nbase The current dimension of the reduced basis.
      * @param nbase_x The maximum dimension of the reduced basis set.
@@ -237,7 +240,7 @@ class DiagoDavid
 
     /**
      * Refreshes the diagonalization solver by updating the basis and the reduced Hamiltonian.
-     * 
+     *
      * @param dim The dimension of the problem.
      * @param nband The number of bands.
      * @param nbase The number of basis states.
@@ -249,7 +252,7 @@ class DiagoDavid
      * @param spsi Pointer to the output array for the updated basis set (nband-th column).
      * @param hcc Pointer to the output array for the updated reduced Hamiltonian.
      * @param vcc Pointer to the output array for the updated eigenvector matrix.
-     * 
+     *
      */
     void refresh(const int& dim,
                  const int& nband,
@@ -286,7 +289,7 @@ class DiagoDavid
 
     /**
      * @brief Plans the Schmidt orthogonalization for a given number of bands.
-     * 
+     *
      * @tparam T The type of the elements in the vectors.
      * @tparam Device The device on which the computation will be performed.
      * @param nband The number of bands.

From f98a920d36e7eadfd3e6ed9cc5a503807fb512f0 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Fri, 17 Oct 2025 17:20:20 +0800
Subject: [PATCH 03/17] Remove itype parameter for heevx that is required only
 in gv

---
 .../base/third_party/lapack.h                 | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/source/source_base/module_container/base/third_party/lapack.h b/source/source_base/module_container/base/third_party/lapack.h
index 0117734993..6e272d2b29 100644
--- a/source/source_base/module_container/base/third_party/lapack.h
+++ b/source/source_base/module_container/base/third_party/lapack.h
@@ -3,9 +3,9 @@
  * @brief This is a direct wrapper of some LAPACK routines.
  * \b Column-Major version.
  * Direct wrapping of standard LAPACK routines. (Column-Major, fortran style)
- * 
+ *
  * @warning For Row-major version, please refer to \c source/source_base/module_external/lapack_connector.h.
- * 
+ *
  * @note
  * Some slight modification are made to fit the C++ style for overloading purpose.
  * You can find some function with different parameter list than the original LAPACK routine.
@@ -294,7 +294,7 @@ void hegvx(const int itype, const char jobz, const char range, const char uplo,
 
 // wrap function of fortran lapack routine zheevx.
 static inline
-void heevx( const int itype, const char jobz, const char range, const char uplo, const int n,
+void heevx(const char jobz, const char range, const char uplo, const int n,
              float* a, const int lda,
              const float vl, const float vu, const int il, const int iu, const float abstol,
              const int m, float* w, float* z, const int ldz,
@@ -307,11 +307,11 @@ void heevx( const int itype, const char jobz, const char range, const char uplo,
 }
 // wrap function of fortran lapack routine zheevx.
 static inline
-void heevx( const int itype, const char jobz, const char range, const char uplo, const int n,
-             double* a, const int lda,
-             const double vl, const double vu, const int il, const int iu, const double abstol,
-             const int m, double* w, double* z, const int ldz,
-             double* work, const int lwork, double* rwork, int* iwork, int* ifail, int info)
+void heevx(const char jobz, const char range, const char uplo, const int n,
+            double* a, const int lda,
+            const double vl, const double vu, const int il, const int iu, const double abstol,
+            const int m, double* w, double* z, const int ldz,
+            double* work, const int lwork, double* rwork, int* iwork, int* ifail, int info)
 {
     dsyevx_(&jobz, &range, &uplo, &n,
             a, &lda, &vl, &vu, &il, &iu,
@@ -319,7 +319,7 @@ void heevx( const int itype, const char jobz, const char range, const char uplo,
             work, &lwork, rwork, iwork, ifail, &info);
 }
 static inline
-void heevx( const int itype, const char jobz, const char range, const char uplo, const int n,
+void heevx(const char jobz, const char range, const char uplo, const int n,
              std::complex<float>* a, const int lda,
              const float vl, const float vu, const int il, const int iu, const float abstol,
              const int m, float* w, std::complex<float>* z, const int ldz,
@@ -332,7 +332,7 @@ void heevx( const int itype, const char jobz, const char range, const char uplo,
 }
 // wrap function of fortran lapack routine zheevx.
 static inline
-void heevx( const int itype, const char jobz, const char range, const char uplo, const int n,
+void heevx(const char jobz, const char range, const char uplo, const int n,
              std::complex<double>* a, const int lda,
              const double vl, const double vu, const int il, const int iu, const double abstol,
              const int m, double* w, std::complex<double>* z, const int ldz,
@@ -399,17 +399,17 @@ static inline
 void potrf( const char &uplo, const int &n, float* A, const int &lda, int &info )
 {
 	spotrf_(&uplo, &n, A, &lda, &info );
-}	
+}
 static inline
 void potrf( const char &uplo, const int &n, double* A, const int &lda, int &info )
 {
 	dpotrf_(&uplo, &n, A, &lda, &info );
-}	
+}
 static inline
 void potrf( const char &uplo, const int &n, std::complex<float>* A, const int &lda, int &info )
 {
 	cpotrf_(&uplo, &n, A, &lda, &info );
-}	
+}
 static inline
 void potrf( const char &uplo, const int &n, std::complex<double>* A, const int &lda, int &info )
 {

From 679286e99d86fc0abb4fc6abe8c6368466b1c8e7 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Fri, 17 Oct 2025 17:21:21 +0800
Subject: [PATCH 04/17] Update heevx to use correct arg list

---
 source/source_hsolver/kernels/hegvd_op.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/source/source_hsolver/kernels/hegvd_op.cpp b/source/source_hsolver/kernels/hegvd_op.cpp
index 0959338005..aa9d56239b 100644
--- a/source/source_hsolver/kernels/hegvd_op.cpp
+++ b/source/source_hsolver/kernels/hegvd_op.cpp
@@ -177,7 +177,6 @@ struct heevx_op<T, base_device::DEVICE_CPU>
         // When lwork = -1, the demension of work will be assumed
         // Assume the denmension of work by output work[0]
         lapackConnector::heevx(
-            1,          // ITYPE = 1:  A*x = (lambda)*B*x
             'V',        // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
             'I',        // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
             'L',        // UPLO = 'L':  Lower triangles of A and B are stored.
@@ -211,7 +210,6 @@ struct heevx_op<T, base_device::DEVICE_CPU>
         // obtained by the zhegvx operation is (nstart * nstart) and stored in zux (internal to the function). When
         // the function is output, the data of zux will be mapped to the corresponding position of V.
         lapackConnector::heevx(
-            1,          // ITYPE = 1:  A*x = (lambda)*B*x
             'V',        // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
             'I',        // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
             'L',        // UPLO = 'L':  Lower triangles of A and B are stored.

From 82b8acf93964178410be0532843bbda0a35193cd Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sat, 18 Oct 2025 00:20:05 +0800
Subject: [PATCH 05/17] Add lapack_heevx

---
 .../ATen/kernels/cuda/lapack.cu               |  33 +++
 .../module_container/ATen/kernels/lapack.cpp  |  97 +++++++
 .../module_container/ATen/kernels/lapack.h    |  41 ++-
 .../module_container/base/macros/cuda.h       |  25 +-
 .../base/third_party/cusolver.h               | 242 ++++++++++++++++++
 5 files changed, 434 insertions(+), 4 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index 33e1eef7f5..b72ffbb3c0 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -101,6 +101,39 @@ struct lapack_heevd<T, DEVICE_GPU> {
     }
 };
 
+template <typename T>
+struct lapack_heevx<T, DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int n,
+        const int lda,
+        T *d_Mat,
+        const int neig,
+        Real *d_eigen_val,
+        T *d_eigen_vec)
+    {
+        // copy d_Mat to d_eigen_vec, and results will be overwritten into d_eigen_vec
+        // by cuSolver
+        cudaErrcheck(cudaMemcpy(d_eigen_vec, d_Mat, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice));
+
+        int meig = 0;
+
+        cuSolverConnector::heevdx(
+            cusolver_handle,
+            n,
+            lda,
+            d_eigen_vec,
+            'V',        // jobz: compute vectors
+            'L',        // uplo: lower triangle
+            'I',        // range: by index
+            1, neig,    // il, iu
+            Real(0), Real(0), // vl, vu (unused)
+            d_eigen_val,
+            &meig
+        );
+
+    }
+};
 template <typename T>
 struct lapack_hegvd<T, DEVICE_GPU> {
     using Real = typename GetTypeReal<T>::type;
diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 90a3c5a0b6..3780caadf6 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -4,10 +4,17 @@
 
 // #include <cstring> // std::memcpy
 #include <algorithm> // std::copy
+#include <stdexcept>
+#include <string>
 
 namespace container {
 namespace kernels {
 
+inline double get_real(const std::complex<double> &x) { return x.real(); }
+inline float get_real(const std::complex<float> &x) { return x.real(); }
+inline double get_real(const double &x) { return x; }
+inline float get_real(const float &x) { return x; }
+
 template <typename T>
 struct set_matrix<T, DEVICE_CPU> {
     void operator() (
@@ -95,6 +102,96 @@ struct lapack_heevd<T, DEVICE_CPU> {
     }
 };
 
+template <typename T>
+struct lapack_heevx<T, DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int n,
+        const int lda,
+        T *Mat,
+        const int neig,
+        Real *eigen_val,
+        T *eigen_vec)
+    {
+        Tensor aux(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {n * lda});
+        // Copy Mat to aux since heevx will destroy it
+        // aux = Mat
+        std::copy(Mat, Mat + n * lda, aux);
+
+        char jobz = 'V';        // Compute eigenvalues and eigenvectors
+        char range = 'I';       // Find eigenvalues in index range [il, iu]
+        char uplo = 'L';        // Use Lower triangle
+        int info = 0;
+        int found = 0;          // Number of eigenvalues found
+        // found should be iu - il + 1, i.e. found = neig
+        const int il = 1;
+        const int iu = neig;
+        Real abstol = 0.0;
+
+        // Workspace query first
+        int lwork = -1;
+        T work_query;
+        Real rwork_query;
+        int iwork_query;
+        int ifail_query;
+
+        // Dummy call to get optimal workspace size
+        // when lwork = -1
+        lapackConnector::heevx(
+            jobz, range, uplo, n,
+            aux, lda,
+            0.0, 0.0, il, iu,   // vl, vu not used when range='I'
+            abstol,
+            &found,
+            eigen_val,
+            eigen_vec, lda,
+            &work_query, lwork,
+            &rwork_query,
+            &iwork_query,
+            &ifail_query,
+            &info);
+
+        if (info != 0) {
+            throw std::runtime_error("heevx workspace query failed with info = " + std::to_string(info));
+        }
+
+        lwork = static_cast<int>(get_real(work_query));
+
+        // Allocate buffers using Tensor (RAII)
+        Tensor work(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {lwork});
+        work.zero();
+
+        Tensor rwork(DataTypeToEnum<Real>::value, DeviceType::CpuDevice, {7 * n});
+        rwork.zero();
+
+        Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {5 * n});
+        iwork.zero();
+
+        Tensor ifail(DataType::DT_INT, DeviceType::CpuDevice, {n});
+        ifail.zero();
+
+        // Actual call to heevx
+        lapackConnector::heevx(
+            jobz, range, uplo, n,
+            aux, lda,
+            0.0, 0.0, il, iu,
+            abstol,
+            &found,
+            eigen_val,
+            eigen_vec, lda,
+            work.data<T>(), lwork,
+            rwork.data<Real>(),
+            iwork.data<int>(),
+            ifail.data<int>(),
+            &info);
+
+        if (info != 0) {
+            throw std::runtime_error("heevx failed with info = " + std::to_string(info));
+        }
+
+    }
+};
+
 template <typename T>
 struct lapack_hegvd<T, DEVICE_CPU> {
     using Real = typename GetTypeReal<T>::type;
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index 5f201769ce..8ec3bc73e1 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -1,6 +1,7 @@
 #ifndef ATEN_KERNELS_LAPACK_H_
 #define ATEN_KERNELS_LAPACK_H_
 
+#include "source_base/macros.h"
 #include <ATen/core/tensor.h>
 #include <ATen/core/tensor_types.h>
 
@@ -51,6 +52,40 @@ struct lapack_heevd {
         Real* eigen_val);
 };
 
+template <typename T, typename Device>
+struct lapack_heevx {
+    using Real = typename GetTypeReal<T>::type;
+    /**
+     * @brief Computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix.
+     *
+     * This function solves the problem A*x = lambda*x, where A is a Hermitian matrix.
+     * It computes a subset of eigenvalues and, optionally, the corresponding eigenvectors.
+     *
+     * @param jobz  'N': Compute eigenvalues only; 'V': Compute eigenvalues and eigenvectors.
+     * @param range 'A': All eigenvalues; 'V': Eigenvalues in the half-open interval (vl, vu]; 'I': Eigenvalues with indices il through iu.
+     * @param uplo  'U': Upper triangle of A is stored; 'L': Lower triangle is stored.
+     * @param dim   The order of the matrix A. dim >= 0.
+     * @param Mat   On entry, the Hermitian matrix A. On exit, it may be overwritten.
+     * @param vl    Lower bound of the interval to search for eigenvalues if range == 'V'.
+     * @param vu    Upper bound of the interval to search for eigenvalues if range == 'V'.
+     * @param il    Index of the smallest eigenvalue to be returned if range == 'I'.
+     * @param iu    Index of the largest eigenvalue to be returned if range == 'I'.
+     * @param m     Output: The total number of found eigenvalues.
+     * @param eigen_val Array to store the computed eigenvalues in ascending order.
+     * @param eigen_vec If not nullptr and jobz == 'V', array to store the computed eigenvectors.
+     *
+     * @note
+     * See LAPACK ZHEEVX or CHEEVX documentation for more details.
+     *
+     */
+    void operator()(
+        const int dim,
+        const int lda,
+        T *Mat,
+        const int neig,
+        Real *eigen_val,
+        T *eigen_vec);
+};
 
 template <typename T, typename Device>
 struct lapack_hegvd {
@@ -60,8 +95,8 @@ struct lapack_hegvd {
      *
      * This function solves the problem A*x = lambda*B*x, where A and B are Hermitian matrices, and B is also positive definite.
      *
-     * @param dim The order of the matrices Mat_A and Mat_B. dim >= 0.
-     * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, dim).
+     * @param n The order of the matrices Mat_A and Mat_B. n >= 0.
+     * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, n).
      * @param Mat_A On entry, the Hermitian matrix A. On exit, it may be overwritten.
      * @param Mat_B On entry, the Hermitian positive definite matrix B. On exit, it may be overwritten.
      * @param eigen_val Array to store the computed eigenvalues in ascending order.
@@ -72,7 +107,7 @@ struct lapack_hegvd {
      * This function assumes that A and B have the same leading dimensions, lda.
      */
     void operator()(
-        const int dim,
+        const int n,
         const int lda,
         T *Mat_A,
         T *Mat_B,
diff --git a/source/source_base/module_container/base/macros/cuda.h b/source/source_base/module_container/base/macros/cuda.h
index ac5ced387a..e9fb2866e7 100644
--- a/source/source_base/module_container/base/macros/cuda.h
+++ b/source/source_base/module_container/base/macros/cuda.h
@@ -121,6 +121,29 @@ static inline cusolverEigType_t cublas_eig_type(const int& itype)
         throw std::runtime_error("cublas_eig_mode: unknown diag");
 }
 
+/**
+ * @brief Converts a character specifying eigenvalue range to cuSOLVER enum.
+ *
+ *        'A' or 'a' -> CUSOLVER_EIG_RANGE_ALL: all eigenvalues
+ *        'V' or 'v' -> CUSOLVER_EIG_RANGE_V:  values in [vl, vu]
+ *        'I' or 'i' -> CUSOLVER_EIG_RANGE_I:  indices in [il, iu]
+ *
+ * @param range Character indicating selection mode ('A', 'V', 'I')
+ * @return Corresponding cusolverEigRange_t enum value
+ * @throws std::runtime_error if character is invalid
+ */
+static inline cusolverEigRange_t cublas_eig_range(const char& range)
+{
+    if (range == 'A' || range == 'a')
+        return CUSOLVER_EIG_RANGE_ALL;
+    else if (range == 'V' || range == 'v')
+        return CUSOLVER_EIG_RANGE_V;
+    else if (range == 'I' || range == 'i')
+        return CUSOLVER_EIG_RANGE_I;
+    else
+        throw std::runtime_error("cublas_eig_range: unknown range '" + std::string(1, range) + "'");
+}
+
 // cuSOLVER API errors
 static const char* cusolverGetErrorEnum(cusolverStatus_t error)
 {
@@ -226,4 +249,4 @@ inline void cublasAssert(cublasStatus_t res, const char* file, int line)
 #define cudaCheckOnDebug()
 #endif
 
-#endif // BASE_MACROS_CUDA_H_
\ No newline at end of file
+#endif // BASE_MACROS_CUDA_H_
diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h
index 6bd137d718..b372babda9 100644
--- a/source/source_base/module_container/base/third_party/cusolver.h
+++ b/source/source_base/module_container/base/third_party/cusolver.h
@@ -246,6 +246,248 @@ void heevd (cusolverDnHandle_t& cusolver_handle, const char& jobz, const char& u
     cudaErrcheck(cudaFree(d_work));
 }
 
+// =====================================================================================================
+// heevdx: Compute eigenvalues and eigenvectors of symmetric/Hermitian matrix
+// =====================================================================================================
+// --- float ---
+static inline
+void heevdx(cusolverDnHandle_t& cusolver_handle,
+    const int n,
+    const int lda,
+    float* d_A,
+    const char jobz,
+    const char uplo,
+    const char range,
+    const int il, const int iu,
+    const float vl, const float vu,
+    float* d_eigen_val,
+    int* h_meig)
+{
+    int lwork = 0;
+    int* d_info = nullptr;
+    float* d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+
+    cusolverErrcheck(cusolverDnSsyevdx_bufferSize(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n, d_A, lda,
+        vl, vu, il, iu,
+        h_meig,         // ← int* output: number of eigenvalues found
+        d_eigen_val,    // ← const float* W (used for query, can be dummy)
+        &lwork          // ← int* lwork (output)
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
+
+    // Main call
+    cusolverErrcheck(cusolverDnSsyevdx(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n,
+        d_A, lda,
+        vl, vu, il, iu,
+        h_meig,         // ← int* output
+        d_eigen_val,    // ← float* W: eigenvalues
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        cudaFree(d_info); cudaFree(d_work);
+        throw std::runtime_error("heevdx (float) failed with info = " + std::to_string(h_info));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+}
+
+// --- double ---
+static inline
+void heevdx(cusolverDnHandle_t& cusolver_handle,
+    const int n,
+    const int lda,
+    double* d_A,
+    const char jobz,
+    const char uplo,
+    const char range,
+    const int il, const int iu,
+    const double vl, const double vu,
+    double* d_eigen_val,
+    int* h_meig)
+{
+    int lwork = 0;
+    int* d_info = nullptr;
+    double* d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+
+    cusolverErrcheck(cusolverDnDsyevdx_bufferSize(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n, d_A, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
+
+    cusolverErrcheck(cusolverDnDsyevdx(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n,
+        d_A, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        cudaFree(d_info); cudaFree(d_work);
+        throw std::runtime_error("heevdx (double) failed with info = " + std::to_string(h_info));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+}
+
+// --- complex<float> ---
+static inline
+void heevdx(cusolverDnHandle_t& cusolver_handle,
+    const int n,
+    const int lda,
+    std::complex<float>* d_A,
+    const char jobz,
+    const char uplo,
+    const char range,
+    const int il, const int iu,
+    const float vl, const float vu,
+    float* d_eigen_val,
+    int* h_meig)
+{
+    int lwork = 0;
+    int* d_info = nullptr;
+    cuComplex* d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+
+    cusolverErrcheck(cusolverDnCheevdx_bufferSize(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n,
+        reinterpret_cast<cuComplex*>(d_A), lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
+
+    cusolverErrcheck(cusolverDnCheevdx(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n,
+        reinterpret_cast<cuComplex*>(d_A), lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        cudaFree(d_info); cudaFree(d_work);
+        throw std::runtime_error("heevdx (complex<float>) failed with info = " + std::to_string(h_info));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+}
+
+// --- complex<double> ---
+static inline
+void heevdx(cusolverDnHandle_t& cusolver_handle,
+    const int n,
+    const int lda,
+    std::complex<double>* d_A,
+    const char jobz,
+    const char uplo,
+    const char range,
+    const int il, const int iu,
+    const double vl, const double vu,
+    double* d_eigen_val,
+    int* h_meig)
+{
+    int lwork = 0;
+    int* d_info = nullptr;
+    cuDoubleComplex* d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+
+    cusolverErrcheck(cusolverDnZheevdx_bufferSize(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n,
+        reinterpret_cast<cuDoubleComplex*>(d_A), lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
+
+    cusolverErrcheck(cusolverDnZheevdx(
+        cusolver_handle,
+        jobz_t, range_t, uplo_t,
+        n,
+        reinterpret_cast<cuDoubleComplex*>(d_A), lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info != 0) {
+        cudaFree(d_info); cudaFree(d_work);
+        throw std::runtime_error("heevdx (complex<double>) failed with info = " + std::to_string(h_info));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+}
+
 static inline
 void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& jobz, const char& uplo, const int& n, float* A, const int& lda, float* B, const int& ldb, float * W)
 {

From 93da75065fe6aba566e96c9c251fd0e7a4dcf55c Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sat, 18 Oct 2025 01:01:09 +0800
Subject: [PATCH 06/17] Fix lapack_heevx and add template instantiation

---
 .../ATen/kernels/cuda/lapack.cu               |  7 +++++-
 .../module_container/ATen/kernels/lapack.cpp  | 23 ++++++++++++-------
 .../module_container/ATen/kernels/lapack.h    |  2 +-
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index b72ffbb3c0..bef3d04044 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -107,7 +107,7 @@ struct lapack_heevx<T, DEVICE_GPU> {
     void operator()(
         const int n,
         const int lda,
-        T *d_Mat,
+        const T *d_Mat,
         const int neig,
         Real *d_eigen_val,
         T *d_eigen_vec)
@@ -232,6 +232,11 @@ template struct lapack_heevd<double, DEVICE_GPU>;
 template struct lapack_heevd<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_heevd<std::complex<double>, DEVICE_GPU>;
 
+template struct lapack_heevx<float, DEVICE_GPU>;
+template struct lapack_heevx<double, DEVICE_GPU>;
+template struct lapack_heevx<std::complex<float>, DEVICE_GPU>;
+template struct lapack_heevx<std::complex<double>, DEVICE_GPU>;
+
 template struct lapack_hegvd<float,  DEVICE_GPU>;
 template struct lapack_hegvd<double, DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<float>,  DEVICE_GPU>;
diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 3780caadf6..0003c23a46 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -1,9 +1,11 @@
+#include "source_base/module_device/types.h"
 #include <ATen/kernels/lapack.h>
 
 #include <base/third_party/lapack.h>
 
 // #include <cstring> // std::memcpy
 #include <algorithm> // std::copy
+#include <complex>
 #include <stdexcept>
 #include <string>
 
@@ -108,7 +110,7 @@ struct lapack_heevx<T, DEVICE_CPU> {
     void operator()(
         const int n,
         const int lda,
-        T *Mat,
+        const T *Mat,
         const int neig,
         Real *eigen_val,
         T *eigen_vec)
@@ -116,7 +118,7 @@ struct lapack_heevx<T, DEVICE_CPU> {
         Tensor aux(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {n * lda});
         // Copy Mat to aux since heevx will destroy it
         // aux = Mat
-        std::copy(Mat, Mat + n * lda, aux);
+        std::copy(Mat, Mat + n * lda, aux.data<T>());
 
         char jobz = 'V';        // Compute eigenvalues and eigenvectors
         char range = 'I';       // Find eigenvalues in index range [il, iu]
@@ -139,17 +141,17 @@ struct lapack_heevx<T, DEVICE_CPU> {
         // when lwork = -1
         lapackConnector::heevx(
             jobz, range, uplo, n,
-            aux, lda,
+            aux.data<T>(), lda,
             0.0, 0.0, il, iu,   // vl, vu not used when range='I'
             abstol,
-            &found,
+            found,
             eigen_val,
             eigen_vec, lda,
             &work_query, lwork,
             &rwork_query,
             &iwork_query,
             &ifail_query,
-            &info);
+            info);
 
         if (info != 0) {
             throw std::runtime_error("heevx workspace query failed with info = " + std::to_string(info));
@@ -173,17 +175,17 @@ struct lapack_heevx<T, DEVICE_CPU> {
         // Actual call to heevx
         lapackConnector::heevx(
             jobz, range, uplo, n,
-            aux, lda,
+            aux.data<T>(), lda,
             0.0, 0.0, il, iu,
             abstol,
-            &found,
+            found,
             eigen_val,
             eigen_vec, lda,
             work.data<T>(), lwork,
             rwork.data<Real>(),
             iwork.data<int>(),
             ifail.data<int>(),
-            &info);
+            info);
 
         if (info != 0) {
             throw std::runtime_error("heevx failed with info = " + std::to_string(info));
@@ -388,6 +390,11 @@ template struct lapack_heevd<double, DEVICE_CPU>;
 template struct lapack_heevd<std::complex<float>,  DEVICE_CPU>;
 template struct lapack_heevd<std::complex<double>, DEVICE_CPU>;
 
+template struct lapack_heevx<float, DEVICE_CPU>;
+template struct lapack_heevx<double, DEVICE_CPU>;
+template struct lapack_heevx<std::complex<float>, DEVICE_CPU>;
+template struct lapack_heevx<std::complex<double>, DEVICE_CPU>;
+
 template struct lapack_hegvd<float,  DEVICE_CPU>;
 template struct lapack_hegvd<double, DEVICE_CPU>;
 template struct lapack_hegvd<std::complex<float>,  DEVICE_CPU>;
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index 8ec3bc73e1..32778a5cc5 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -81,7 +81,7 @@ struct lapack_heevx {
     void operator()(
         const int dim,
         const int lda,
-        T *Mat,
+        const T *Mat,
         const int neig,
         Real *eigen_val,
         T *eigen_vec);

From 233254a2eb76e8c45698325737c2d2d91c243d94 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Sat, 18 Oct 2025 01:01:52 +0800
Subject: [PATCH 07/17] Replace hsolver::heevx_op with
 ct::kernels::lapack_heevx in diago_david

---
 source/source_hsolver/diago_david.cpp | 12 +++++++-----
 source/source_hsolver/diago_david.h   |  4 +++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/source/source_hsolver/diago_david.cpp b/source/source_hsolver/diago_david.cpp
index 607041089f..bf390104af 100644
--- a/source/source_hsolver/diago_david.cpp
+++ b/source/source_hsolver/diago_david.cpp
@@ -12,7 +12,7 @@ using namespace hsolver;
 
 
 template <typename T, typename Device>
-DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in, 
+DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in,
                                   const int nband_in,
                                   const int dim_in,
                                   const int david_ndim_in,
@@ -80,7 +80,7 @@ DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in,
     resmem_complex_op()(this->vcc, nbase_x * nbase_x, "DAV::vcc");
     setmem_complex_op()(this->vcc, 0, nbase_x * nbase_x);
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-    
+
     // lagrange_matrix(nband, nband); // for orthogonalization
     resmem_complex_op()(this->lagrange_matrix, nband * nband);
     setmem_complex_op()(this->lagrange_matrix, 0, nband * nband);
@@ -409,7 +409,7 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // basis[nbase] = basis[nbase] - spsi * vc_ev_vector
     //              = hpsi - spsi * lambda * vcc
     //              = (H - lambda * S) * psi * vcc
-    //              = (H - lambda * S) * psi_new 
+    //              = (H - lambda * S) * psi_new
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
     ModuleBase::gemm_op<T, Device>()('N',
                                      'N',
@@ -622,7 +622,8 @@ void DiagoDavid<T, Device>::diag_zhegvx(const int& nbase,
             resmem_var_op()(eigenvalue_gpu, nbase_x);
             syncmem_var_h2d_op()(eigenvalue_gpu, this->eigenvalue, nbase_x);
 
-            heevx_op<T, Device>()(this->ctx, nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc);
+            // heevx_op<T, Device>()(this->ctx, nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc);
+            ct::kernels::lapack_heevx<T, ct_Device>()(nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc);
 
             syncmem_var_d2h_op()(this->eigenvalue, eigenvalue_gpu, nbase_x);
             delmem_var_op()(eigenvalue_gpu);
@@ -630,7 +631,8 @@ void DiagoDavid<T, Device>::diag_zhegvx(const int& nbase,
         }
         else
         {
-            heevx_op<T, Device>()(this->ctx, nbase, nbase_x, hcc, nband, this->eigenvalue, vcc);
+            //heevx_op<T, Device>()(this->ctx, nbase, nbase_x, hcc, nband, this->eigenvalue, vcc);
+            ct::kernels::lapack_heevx<T, ct_Device>()(nbase, nbase_x, hcc, nband, this->eigenvalue, vcc);
         }
     }
 
diff --git a/source/source_hsolver/diago_david.h b/source/source_hsolver/diago_david.h
index 02d2f46697..75a745d326 100644
--- a/source/source_hsolver/diago_david.h
+++ b/source/source_hsolver/diago_david.h
@@ -5,7 +5,7 @@
 #include "source_base/module_device/device.h"   // base_device
 #include "source_base/module_device/memory_op.h"// base_device::memory
 
-// #include "source_base/module_container/ATen/kernels/lapack.h" // container::kernels
+#include "source_base/module_container/ATen/kernels/lapack.h" // container::kernels
 
 #include "source_hsolver/diag_comm_info.h"
 #include "source_hsolver/kernels/hegvd_op.h"
@@ -341,6 +341,8 @@ class DiagoDavid
     using syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
     using syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;
 
+    // Note that ct_Device is different from base_device!
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
     // using hpsi_info = typename hamilt::Operator<T, Device>::hpsi_info; // Dependence of hpsi removed
 
     const T *one = nullptr, *zero = nullptr, *neg_one = nullptr;

From c42bbc04f202dac17662debe1177a4b203a314a3 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 20 Oct 2025 01:37:21 +0800
Subject: [PATCH 08/17] Add lapack_hegvx

---
 .../ATen/kernels/cuda/lapack.cu               |  53 +-
 .../module_container/ATen/kernels/lapack.cpp  | 180 +++---
 .../module_container/ATen/kernels/lapack.h    |  64 +-
 .../base/third_party/cusolver.h               | 567 ++++++++++++------
 4 files changed, 540 insertions(+), 324 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index bef3d04044..34fca37898 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -149,26 +149,40 @@ struct lapack_hegvd<T, DEVICE_GPU> {
         const char jobz = 'V';
         const char uplo = 'U';
         cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice));
-        cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val);
+        cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim,
+                eigen_vec, lda, Mat_B, lda,
+                eigen_val);
     }
 };
 
-// template <typename T>
-// struct lapack_hegvx<T, DEVICE_GPU> {
-//     using Real = typename GetTypeReal<T>::type;
-//     void operator()(
-//         const int n,
-//         const int lda,
-//         T *A,
-//         T *B,
-//         const int m,
-//         Real *eigen_val,
-//         T *eigen_vec)
-//     {
-//         cuSolverConnector::hegvx(cusolver_handle, n, lda, A, B, m, eigen_val, eigen_vec);
-//     }
-// };
-//
+template <typename T>
+struct lapack_hegvx<T, DEVICE_GPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int n,
+        const int lda,
+        T *A,
+        T *B,
+        const int m,
+        Real *eigen_val,
+        T *eigen_vec)
+    {
+        const int itype = 1;
+        const char jobz = 'V';
+        const char range = 'I';
+        const char uplo = 'U';
+        int meig = 0;
+        cuSolverConnector::hegvdx(cusolver_handle,
+            itype, jobz, range, uplo,
+            n, lda, A, B,
+            Real(0), Real(0),
+            1, m, &meig,
+            eigen_val, eigen_vec);
+    }
+};
+
+
+
 template <typename T>
 struct lapack_getrf<T, DEVICE_GPU> {
     void operator()(
@@ -242,6 +256,11 @@ template struct lapack_hegvd<double, DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<double>, DEVICE_GPU>;
 
+template struct lapack_hegvx<float,  DEVICE_GPU>;
+template struct lapack_hegvx<double, DEVICE_GPU>;
+template struct lapack_hegvx<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_hegvx<std::complex<double>, DEVICE_GPU>;
+
 template struct lapack_getrf<float,  DEVICE_GPU>;
 template struct lapack_getrf<double, DEVICE_GPU>;
 template struct lapack_getrf<std::complex<float>,  DEVICE_GPU>;
diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 0003c23a46..185707ff8f 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -217,7 +217,7 @@ struct lapack_hegvd<T, DEVICE_CPU> {
 
         const int itype = 1;
         const char jobz = 'V';
-        const char uplo = 'U';
+        const char uplo = 'L';
         int info = 0;
         int lwork = std::max(2 * dim + dim * dim, 1 + 6 * dim + 2 * dim * dim);
         Tensor work(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {lwork});
@@ -240,80 +240,105 @@ struct lapack_hegvd<T, DEVICE_CPU> {
 };
 
 
-// template <typename T>
-// struct lapack_hegvx<T, DEVICE_CPU> {
-//     using Real = typename GetTypeReal<T>::type;
-//     void operator()(
-//         const int n,
-//         const int lda,
-//         T *A,
-//         T *B,
-//         const int m,
-//         Real *eigen_val,
-//         T *eigen_vec)
-//     {
-//         int info = 0;
-
-//         int mm = m;
-
-//         int lwork = -1;
-
-//         T *work = new T[1];
-//         Real *rwork = new Real[7 * n];
-//         int *iwork = new int[5 * n];
-//         int *ifail = new int[n];
-
-//         // set lwork = -1 to query optimal work size
-//         lapackConnector::hegvx(1, // ITYPE = 1:  A*x = (lambda)*B*x
-//             'V', 'I', 'U',
-//             n, A, lda, B, lda,
-//             0.0, 0.0,
-//             1, m, 0.0, mm,
-//             eigen_val, eigen_vec, lda,
-//             work,
-//             lwork,      // lwork = 1, query optimal size.
-//             rwork, iwork, ifail,
-//             info);
-
-//         // !>  If LWORK = -1, then a workspace query is assumed; the routine
-//         // !>  only calculates the optimal size of the WORK array, returns
-//         // !>  this value as the first entry of the WORK array.
-//         lwork = int(get_real(work[0]));
-//         delete[] work;
-//         work = new T[lwork];
-
-//         lapackConnector::hegvx(
-//             1, // ITYPE = 1:  A*x = (lambda)*B*x
-//             'V',    // JOBZ = 'V':  Compute eigenvalues and eigenvectors.
-//             'I',    // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
-//             'U',    // UPLO = 'U':  Upper triangles of A and B are stored.
-//             n,      // order of the matrices A and B.
-//             A,      // A is COMPLEX*16 array  dimension (LDA, N)
-//             lda,    // leading dimension of the array A.
-//             B,      // B is COMPLEX*16 array, dimension (LDB, N)
-//             lda,    // assume that leading dimension of B is the same as A.
-//             0.0,    // VL, Not referenced if RANGE = 'A' or 'I'.
-//             0.0,    // VU, Not referenced if RANGE = 'A' or 'I'.
-//             1,      // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
-//             m,      // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
-//             0.0,    // ABSTOL
-//             mm,     // M: The total number of eigenvalues found.  0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
-//             eigen_val,  // W store eigenvalues
-//             eigen_vec,  // Z store eigenvector
-//             lda,        // LDZ: The leading dimension of the array Z.
-//             work,
-//             lwork,
-//             rwork,
-//             iwork,
-//             ifail,
-//             info);
-
-//         delete[] work;
-//         delete[] rwork;
-//         delete[] iwork;
-//         delete[] ifail;
-//     }
-// };
+template <typename T>
+struct lapack_hegvx<T, DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int n,
+        const int lda,
+        T *Mat_A,
+        T *Mat_B,
+        const int m,
+        Real *eigen_val,
+        T *eigen_vec)
+    {
+        // first copy Mat_A and Mat_B to auxiliary memory
+        // to avoid the origin block being overwritten by hegvx
+        Tensor aux_A(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {n * lda});
+        std::copy(Mat_A, Mat_A + n * lda, aux_A.data<T>());
+        Tensor aux_B(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {n * lda});
+        std::copy(Mat_B, Mat_B + n * lda, aux_B.data<T>());
+
+        const int itype = 1;    // ITYPE = 1:  A*x = (lambda)*B*x
+        const char jobz = 'V';// JOBZ = 'V':  Compute eigenvalues and eigenvectors.
+        const char range = 'I'; // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
+        const char uplo = 'L'; // UPLO = 'L':  Lower triangles of A and B are stored.
+
+        const int il = 1;
+        const int iu = m;
+        int found = m; // Found, should be iu - il + 1
+        int info = 0;
+
+        int lwork = -1;
+
+        T work_query;
+        Real rwork_query;
+
+        // set lwork = -1 to query optimal work size
+        lapackConnector::hegvx(
+                    itype, jobz, range, uplo,
+                    n,
+                    aux_A.data<T>(), lda,        // A (in/out)
+                    aux_B.data<T>(), lda,        // B (in/out)
+                    0.0, 0.0,                    // VL, VU (not used)
+                    il, iu,                      // IL, IU
+                    Real(0.0),                   // ABSTOL
+                    found,                       // M (output)
+                    eigen_val,                   // W (output)
+                    eigen_vec, lda,              // Z (output)
+                    &work_query,                 // WORK (query)
+                    lwork,
+                    &rwork_query,                // RWORK (query)
+                    static_cast<int*>(nullptr),  // IWORK (query)
+                    static_cast<int*>(nullptr),  // IFAIL (query)
+                    info);
+
+        // !>  If LWORK = -1, then a workspace query is assumed; the routine
+        // !>  only calculates the optimal size of the WORK array, returns
+        // !>  this value as the first entry of the WORK array.
+        lwork = static_cast<int>(get_real(work_query));
+        lwork = std::max(lwork, 1);
+
+        // work space
+        Tensor work(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {lwork});
+        work.zero();
+
+        const int lrwork = 7 * n;
+        Tensor rwork(DataTypeToEnum<Real>::value, DeviceType::CpuDevice, {lrwork});
+        rwork.zero();
+
+        const int liwork = 5 * n;
+        Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {liwork});
+        iwork.zero();
+
+        std::vector<int> ifail(n);
+
+        lapackConnector::hegvx(
+                    itype, jobz, range, uplo,
+                    n,
+                    aux_A.data<T>(), lda,        // A
+                    aux_B.data<T>(), lda,        // B
+                    0.0, 0.0,                    // VL, VU
+                    il, iu,                      // IL, IU
+                    Real(0.0),                   // ABSTOL
+                    found,                       // M (output)
+                    eigen_val,                   // W
+                    eigen_vec, lda,              // Z (output)
+                    work.data<T>(),              // WORK
+                    lwork,
+                    rwork.data<Real>(),          // RWORK
+                    iwork.data<int>(),           // IWORK
+                    ifail.data(),                // IFAIL
+                    info);
+
+        if (info < 0) {
+            throw std::runtime_error("hegvx failed: illegal argument #" + std::to_string(-info));
+        }
+        if (info > 0) {
+            throw std::runtime_error("hegvx failed to converge. Number of converged eigenvalues: " + std::to_string(info));
+        }
+    }
+};
 
 template <typename T>
 struct lapack_getrf<T, DEVICE_CPU> {
@@ -400,6 +425,11 @@ template struct lapack_hegvd<double, DEVICE_CPU>;
 template struct lapack_hegvd<std::complex<float>,  DEVICE_CPU>;
 template struct lapack_hegvd<std::complex<double>, DEVICE_CPU>;
 
+template struct lapack_hegvx<float,  DEVICE_CPU>;
+template struct lapack_hegvx<double, DEVICE_CPU>;
+template struct lapack_hegvx<std::complex<float>,  DEVICE_CPU>;
+template struct lapack_hegvx<std::complex<double>, DEVICE_CPU>;
+
 template struct lapack_getrf<float,  DEVICE_CPU>;
 template struct lapack_getrf<double, DEVICE_CPU>;
 template struct lapack_getrf<std::complex<float>,  DEVICE_CPU>;
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index 32778a5cc5..2914ee9fe6 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -115,38 +115,38 @@ struct lapack_hegvd {
         T *eigen_vec);
 };
 
-// template <typename T, typename Device>
-// struct lapack_hegvx {
-//     using Real = typename GetTypeReal<T>::type;
-//     /**
-//      * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of
-//      * a complex generalized Hermitian-definite eigenproblem.
-//      *
-//      * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version
-//      * is implemented through the `evd` interface and acquires the first m eigenpairs
-//      *
-//      * hegvx 'V' 'I' 'U'  is used to compute the first m eigenpairs of the problem
-//      *
-//      * @param n The order of the matrices A and B. n >= 0.
-//      * @param lda The leading dimension of the array A and B. lda >= max(1, n).
-//      * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors.
-//      * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B.
-//      * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n.
-//      * @param eigen_val The first m eigenvalues in ascending order.
-//      * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues.
-//      *
-//      * @note
-//      * See LAPACK ZHEGVX doc for more details.
-//      */
-//     void operator()(
-//         const int n,
-//         const int lda,
-//         T *A,
-//         T *B,
-//         const int m,
-//         Real *eigen_val,
-//         T *eigen_vec);
-// };
+template <typename T, typename Device>
+struct lapack_hegvx {
+    using Real = typename GetTypeReal<T>::type;
+    /**
+     * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of
+     * a complex generalized Hermitian-definite eigenproblem.
+     *
+     * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version
+     * is implemented through the `evd` interface and acquires the first m eigenpairs
+     *
+     * hegvx 'V' 'I' 'U'  is used to compute the first m eigenpairs of the problem
+     *
+     * @param n The order of the matrices A and B. n >= 0.
+     * @param lda The leading dimension of the array A and B. lda >= max(1, n).
+     * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors.
+     * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B.
+     * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n.
+     * @param eigen_val The first m eigenvalues in ascending order.
+     * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues.
+     *
+     * @note
+     * See LAPACK ZHEGVX doc for more details.
+     */
+    void operator()(
+        const int n,
+        const int lda,
+        T *Mat_A,
+        T *Mat_B,
+        const int m,
+        Real *eigen_val,
+        T *eigen_vec);
+};
 
 
 template <typename T, typename Device>
diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h
index b372babda9..affdc4ca48 100644
--- a/source/source_base/module_container/base/third_party/cusolver.h
+++ b/source/source_base/module_container/base/third_party/cusolver.h
@@ -593,206 +593,373 @@ void hegvd (cusolverDnHandle_t& cusolver_handle, const int& itype, const char& j
     cudaErrcheck(cudaFree(d_work));
 }
 
-// -----------------------------
-// CUDA hegvx implementations
-// -----------------------------
-
-// static inline
-// void hegvx(cusolverDnHandle_t& cusolver_handle,
-//            const int& itype,
-//            const char& jobz,
-//            const char& uplo,
-//            const int& n,
-//            float* A,
-//            const int& lda,
-//            float* B,
-//            const int& ldb,
-//            const int& il,
-//            const int& iu,
-//            const float& abstol,
-//            int& m_out,
-//            float* W,
-//            float* Z,
-//            const int& ldz) {
-//     // Step 1: Query workspace size
-//     int lwork = 0;
-//     cusolverErrcheck(cusolverDnSsygvdx_bufferSize(
-//         cusolver_handle,
-//         cublas_eig_type(itype),       // itype
-//         cublas_eig_mode(jobz),        // jobz
-//         cublas_fill_mode(uplo),       // uplo
-//         n, A, lda, B, ldb,
-//         il, iu,                       // range: index interval
-//         abstol,                       // tolerance
-//         &m_out,                       // output: number of eigenvalues found
-//         W,                            // eigenvalues
-//         Z, ldz,                       // eigenvectors
-//         &lwork));                      // workspace size
-
-//     // Step 2: Allocate workspace and info
-//     float* d_work = nullptr;
-//     int*   d_info  = nullptr;
-//     cudaErrcheck(cudaMalloc(&d_work, sizeof(float) * lwork));
-//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
-
-//     // Step 3: Call the solver
-//     cusolverErrcheck(cusolverDnSsygvdx(
-//         cusolver_handle,
-//         cublas_eig_type(itype),
-//         cublas_eig_mode(jobz),
-//         cublas_fill_mode(uplo),
-//         n, A, lda, B, ldb,
-//         il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info));
-
-//     // Step 4: Check result
-//     int h_info = 0;
-//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
-//     if (h_info != 0) {
-//         throw std::runtime_error("hegvx: cusolverDnSsygvdx failed with info = " + std::to_string(h_info));
-//     }
-
-//     // Cleanup
-//     cudaErrcheck(cudaFree(d_work));
-//     cudaErrcheck(cudaFree(d_info));
-// }
-
-// Double precision
-// static inline
-// void hegvx(cusolverDnHandle_t& cusolver_handle,
-//            const int& itype,
-//            const char& jobz,
-//            const char& uplo,
-//            const int& n,
-//            double* A,
-//            const int& lda,
-//            double* B,
-//            const int& ldb,
-//            const int& il,
-//            const int& iu,
-//            const double& abstol,
-//            int& m_out,
-//            double* W,
-//            double* Z,
-//            const int& ldz) {
-//     int lwork = 0;
-//     cusolverErrcheck(cusolverDnDsygvdx_bufferSize(
-//         cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb,
-//         il, iu, abstol, &m_out, W, Z, ldz, &lwork));
-
-//     double* d_work = nullptr;
-//     int*    d_info  = nullptr;
-//     cudaErrcheck(cudaMalloc(&d_work, sizeof(double) * lwork));
-//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
-
-//     cusolverErrcheck(cusolverDnDsygvdx(
-//         cusolver_handle, itype, jobz, uplo, n, A, lda, B, ldb,
-//         il, iu, abstol, &m_out, W, Z, ldz, d_work, lwork, d_info));
-
-//     int h_info = 0;
-//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
-//     if (h_info != 0) {
-//         throw std::runtime_error("hegvx: cusolverDnDsygvdx failed with info = " + std::to_string(h_info));
-//     }
-
-//     cudaErrcheck(cudaFree(d_work));
-//     cudaErrcheck(cudaFree(d_info));
-// }
-
-// // Complex single precision
-// static inline
-// void hegvx(cusolverDnHandle_t& cusolver_handle,
-//            const int& itype,
-//            const char& jobz,
-//            const char& uplo,
-//            const int& n,
-//            cuComplex* A,
-//            const int& lda,
-//            cuComplex* B,
-//            const int& ldb,
-//            const int& il,
-//            const int& iu,
-//            const float& abstol,
-//            int& m_out,
-//            float* W,
-//            cuComplex* Z,
-//            const int& ldz) {
-//     int lwork = 0;
-//     cusolverErrcheck(cusolverDnChegvdx_bufferSize(
-//         cusolver_handle, itype, jobz, uplo, n,
-//         reinterpret_cast<cuFloatComplex*>(A), lda,
-//         reinterpret_cast<cuFloatComplex*>(B), ldb,
-//         il, iu, abstol, &m_out, W,
-//         reinterpret_cast<cuFloatComplex*>(Z), ldz, &lwork));
-
-//     cuComplex* d_work = nullptr;
-//     int*       d_info  = nullptr;
-//     cudaErrcheck(cudaMalloc(&d_work, sizeof(cuComplex) * lwork));
-//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
-
-//     cusolverErrcheck(cusolverDnChegvdx(
-//         cusolver_handle, itype, jobz, uplo, n,
-//         reinterpret_cast<cuFloatComplex*>(A), lda,
-//         reinterpret_cast<cuFloatComplex*>(B), ldb,
-//         il, iu, abstol, &m_out, W,
-//         reinterpret_cast<cuFloatComplex*>(Z), ldz, d_work, lwork, d_info));
-
-//     int h_info = 0;
-//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
-//     if (h_info != 0) {
-//         throw std::runtime_error("hegvx: cusolverDnChegvdx failed with info = " + std::to_string(h_info));
-//     }
-
-//     cudaErrcheck(cudaFree(d_work));
-//     cudaErrcheck(cudaFree(d_info));
-// }
-
-// // Complex double precision
-// static inline
-// void hegvx(cusolverDnHandle_t& cusolver_handle,
-//            const int& itype,
-//            const char& jobz,
-//            const char& uplo,
-//            const int& n,
-//            cuDoubleComplex* A,
-//            const int& lda,
-//            cuDoubleComplex* B,
-//            const int& ldb,
-//            const int& il,
-//            const int& iu,
-//            const double& abstol,
-//            int& m_out,
-//            double* W,
-//            cuDoubleComplex* Z,
-//            const int& ldz) {
-//     int lwork = 0;
-//     cusolverErrcheck(cusolverDnZhegvdx_bufferSize(
-//         cusolver_handle, itype, jobz, uplo, n,
-//         reinterpret_cast<cuDoubleComplex*>(A), lda,
-//         reinterpret_cast<cuDoubleComplex*>(B), ldb,
-//         il, iu, abstol, &m_out, W,
-//         reinterpret_cast<cuDoubleComplex*>(Z), ldz, &lwork));
-
-//     cuDoubleComplex* d_work = nullptr;
-//     int*             d_info  = nullptr;
-//     cudaErrcheck(cudaMalloc(&d_work, sizeof(cuDoubleComplex) * lwork));
-//     cudaErrcheck(cudaMalloc(&d_info,  sizeof(int)));
-
-//     cusolverErrcheck(cusolverDnZhegvdx(
-//         cusolver_handle, itype, jobz, uplo, n,
-//         reinterpret_cast<cuDoubleComplex*>(A), lda,
-//         reinterpret_cast<cuDoubleComplex*>(B), ldb,
-//         il, iu, abstol, &m_out, W,
-//         reinterpret_cast<cuDoubleComplex*>(Z), ldz, d_work, lwork, d_info));
-
-//     int h_info = 0;
-//     cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
-//     if (h_info != 0) {
-//         throw std::runtime_error("hegvx: cusolverDnZhegvdx failed with info = " + std::to_string(h_info));
-//     }
-
-//     cudaErrcheck(cudaFree(d_work));
-//     cudaErrcheck(cudaFree(d_info));
-// }
+// =====================================================================================================
+// hegvd x: Compute selected eigenvalues and eigenvectors of generalized Hermitian-definite eigenproblem
+//          A * x = lambda * B * x
+// =====================================================================================================
+
+// --- float ---
+static inline
+void hegvdx(
+    cusolverDnHandle_t& cusolver_handle,
+    const int itype,           // 1: A*x = lambda*B*x
+    const char jobz,           // 'V' or 'N'
+    const char range,          // 'I', 'V', 'A'
+    const char uplo,           // 'U' or 'L'
+    const int n,
+    const int lda,
+    float* d_A,                // Input matrix A (device)
+    float* d_B,                // Input matrix B (device)
+    const float vl,            // for RANGE='V'
+    const float vu,
+    const int il,              // for RANGE='I'
+    const int iu,
+    int* h_meig,               // output: number of eigenvalues found
+    float* d_eigen_val,        // output: eigenvalues
+    float* d_eigen_vec         // output: eigenvectors (if jobz='V'), size ldz × m
+) {
+    int lwork = 0;
+    int *d_info = nullptr;
+    float *d_work = nullptr;
+
+    // Allocate device info
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    // Copy A and B to temporary buffers since sygvdx may modify them
+    float *d_A_copy = nullptr, *d_B_copy = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(float) * n * lda));
+    cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(float) * n * lda));
+    cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice));
+    cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(float) * n * lda, cudaMemcpyDeviceToDevice));
+
+    // Set parameters
+    cusolverEigType_t itype_t = cublas_eig_type(itype);
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+
+    // Query workspace size
+    cusolverErrcheck(cusolverDnSsygvdx_bufferSize(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    // Allocate workspace
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(float) * lwork));
+
+    // Main call
+    cusolverErrcheck(cusolverDnSsygvdx(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    // Check result
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info < 0) {
+        throw std::runtime_error("hegvdx (float): illegal argument #" + std::to_string(-h_info));
+    } else if (h_info > 0) {
+        // If h_info <= n: convergence issue in tridiag solver (no vec) OR
+        // If h_info > n: B's leading minor of order (h_info - n) is not positive definite
+        if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
+            throw std::runtime_error("hegvdx (float): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
+        } else if (h_info > n) {
+            throw std::runtime_error("hegvdx (float): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
+        }
+    }
+
+    // If jobz == 'V', copy eigenvectors from A (which now contains Z) to output
+    if (jobz == 'V') {
+        const int m = (*h_meig); // number of eigenvectors computed
+        cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(float) * n * m, cudaMemcpyDeviceToDevice));
+    }
+
+    // Cleanup
+    cudaFree(d_info);
+    cudaFree(d_work);
+    cudaFree(d_A_copy);
+    cudaFree(d_B_copy);
+}
+
+
+// --- double ---
+static inline
+void hegvdx(
+    cusolverDnHandle_t& cusolver_handle,
+    const int itype,
+    const char jobz,
+    const char range,
+    const char uplo,
+    const int n,
+    const int lda,
+    double* d_A,
+    double* d_B,
+    const double vl,
+    const double vu,
+    const int il,
+    const int iu,
+    int* h_meig,
+    double* d_eigen_val,
+    double* d_eigen_vec
+) {
+    int lwork = 0;
+    int *d_info = nullptr;
+    double *d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    double *d_A_copy = nullptr, *d_B_copy = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(double) * n * lda));
+    cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(double) * n * lda));
+    cudaErrcheck(cudaMemcpy(d_A_copy, d_A, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice));
+    cudaErrcheck(cudaMemcpy(d_B_copy, d_B, sizeof(double) * n * lda, cudaMemcpyDeviceToDevice));
+
+    cusolverEigType_t itype_t = cublas_eig_type(itype);
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+
+    cusolverErrcheck(cusolverDnDsygvdx_bufferSize(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(double) * lwork));
+
+    cusolverErrcheck(cusolverDnDsygvdx(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info < 0) {
+        throw std::runtime_error("hegvdx (double): illegal argument #" + std::to_string(-h_info));
+    } else if (h_info > 0) {
+        if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
+            throw std::runtime_error("hegvdx (double): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
+        } else if (h_info > n) {
+            throw std::runtime_error("hegvdx (double): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
+        }
+    }
+
+    if (jobz == 'V') {
+        const int m = (*h_meig);
+        cudaErrcheck(cudaMemcpy(d_eigen_vec, d_A_copy, sizeof(double) * n * m, cudaMemcpyDeviceToDevice));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+    cudaFree(d_A_copy);
+    cudaFree(d_B_copy);
+}
+
+
+// --- complex<float> ---
+static inline
+void hegvdx(
+    cusolverDnHandle_t& cusolver_handle,
+    const int itype,
+    const char jobz,
+    const char range,
+    const char uplo,
+    const int n,
+    const int lda,
+    std::complex<float>* d_A,
+    std::complex<float>* d_B,
+    const float vl,
+    const float vu,
+    const int il,
+    const int iu,
+    int* h_meig,
+    float* d_eigen_val,
+    std::complex<float>* d_eigen_vec
+) {
+    int lwork = 0;
+    int *d_info = nullptr;
+    cuComplex *d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cuComplex *d_A_copy = nullptr, *d_B_copy = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuComplex) * n * lda));
+    cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuComplex) * n * lda));
+    cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast<cuComplex*>(d_A), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
+    cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast<cuComplex*>(d_B), sizeof(cuComplex) * n * lda, cudaMemcpyDeviceToDevice));
+
+    cusolverEigType_t itype_t = cublas_eig_type(itype);
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+
+    cusolverErrcheck(cusolverDnChegvdx_bufferSize(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuComplex) * lwork));
+
+    cusolverErrcheck(cusolverDnChegvdx(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info < 0) {
+        throw std::runtime_error("hegvdx (complex<float>): illegal argument #" + std::to_string(-h_info));
+    } else if (h_info > 0) {
+        if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
+            throw std::runtime_error("hegvdx (complex<float>): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
+        } else if (h_info > n) {
+            throw std::runtime_error("hegvdx (complex<float>): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
+        }
+    }
+
+    if (jobz == 'V') {
+        const int m = (*h_meig);
+        cudaErrcheck(cudaMemcpy(reinterpret_cast<cuComplex*>(d_eigen_vec), d_A_copy, sizeof(cuComplex) * n * m, cudaMemcpyDeviceToDevice));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+    cudaFree(d_A_copy);
+    cudaFree(d_B_copy);
+}
+
+
+// --- complex<double> ---
+static inline
+void hegvdx(
+    cusolverDnHandle_t& cusolver_handle,
+    const int itype,
+    const char jobz,
+    const char range,
+    const char uplo,
+    const int n,
+    const int lda,
+    std::complex<double>* d_A,
+    std::complex<double>* d_B,
+    const double vl,
+    const double vu,
+    const int il,
+    const int iu,
+    int* h_meig,
+    double* d_eigen_val,
+    std::complex<double>* d_eigen_vec
+) {
+    int lwork = 0;
+    int *d_info = nullptr;
+    cuDoubleComplex *d_work = nullptr;
+
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    cuDoubleComplex *d_A_copy = nullptr, *d_B_copy = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_A_copy, sizeof(cuDoubleComplex) * n * lda));
+    cudaErrcheck(cudaMalloc((void**)&d_B_copy, sizeof(cuDoubleComplex) * n * lda));
+    cudaErrcheck(cudaMemcpy(d_A_copy, reinterpret_cast<cuDoubleComplex*>(d_A), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
+    cudaErrcheck(cudaMemcpy(d_B_copy, reinterpret_cast<cuDoubleComplex*>(d_B), sizeof(cuDoubleComplex) * n * lda, cudaMemcpyDeviceToDevice));
+
+    cusolverEigType_t itype_t = cublas_eig_type(itype);
+    cusolverEigMode_t jobz_t = cublas_eig_mode(jobz);
+    cusolverEigRange_t range_t = cublas_eig_range(range);
+    cublasFillMode_t uplo_t = cublas_fill_mode(uplo);
+
+    cusolverErrcheck(cusolverDnZhegvdx_bufferSize(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        &lwork
+    ));
+
+    cudaErrcheck(cudaMalloc((void**)&d_work, sizeof(cuDoubleComplex) * lwork));
+
+    cusolverErrcheck(cusolverDnZhegvdx(
+        cusolver_handle,
+        itype_t, jobz_t, range_t, uplo_t,
+        n,
+        d_A_copy, lda,
+        d_B_copy, lda,
+        vl, vu, il, iu,
+        h_meig,
+        d_eigen_val,
+        d_work, lwork,
+        d_info
+    ));
+
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (h_info < 0) {
+        throw std::runtime_error("hegvdx (complex<double>): illegal argument #" + std::to_string(-h_info));
+    } else if (h_info > 0) {
+        if (jobz_t == CUSOLVER_EIG_MODE_NOVECTOR && h_info <= n) {
+            throw std::runtime_error("hegvdx (complex<double>): failed to converge, " + std::to_string(h_info) + " off-diagonal elements didn't converge");
+        } else if (h_info > n) {
+            throw std::runtime_error("hegvdx (complex<double>): leading minor of order " + std::to_string(h_info - n) + " of B is not positive definite");
+        }
+    }
+
+    if (jobz == 'V') {
+        const int m = (*h_meig);
+        cudaErrcheck(cudaMemcpy(reinterpret_cast<cuDoubleComplex*>(d_eigen_vec), d_A_copy, sizeof(cuDoubleComplex) * n * m, cudaMemcpyDeviceToDevice));
+    }
+
+    cudaFree(d_info);
+    cudaFree(d_work);
+    cudaFree(d_A_copy);
+    cudaFree(d_B_copy);
+}
+
 
 // --- getrf
 static inline

From 4ddd934b1941f110f0f62fa838a438b87cc7d776 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 20 Oct 2025 01:42:19 +0800
Subject: [PATCH 09/17] Fix hip hegvd

---
 .../module_container/ATen/kernels/rocm/lapack.hip.cu          | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
index 93f0fa481b..07572a657a 100644
--- a/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
+++ b/source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu
@@ -120,12 +120,8 @@ struct lapack_hegvd<T, DEVICE_GPU> {
     void operator()(
         const int dim,
         const int lda,
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
         T* Mat_A,
         T* Mat_B,
-        const int& dim,
         Real* eigen_val,
         T *eigen_vec)
     {

From 75dbbb1fbd5a1875089d641ba4478206a2197d5e Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 20 Oct 2025 02:36:47 +0800
Subject: [PATCH 10/17] Fix hegvd to prevent B from being overwritten by
 Cholesky

---
 .../module_container/ATen/kernels/cuda/lapack.cu  | 15 +++++++++++++--
 .../module_container/ATen/kernels/lapack.cpp      |  5 ++++-
 .../module_container/ATen/kernels/lapack.h        |  3 ++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index 34fca37898..cbd10a67a2 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -147,11 +147,19 @@ struct lapack_hegvd<T, DEVICE_GPU> {
     {
         const int itype = 1;
         const char jobz = 'V';
-        const char uplo = 'U';
+        const char uplo = 'L';
         cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice));
+
+        // prevent B from being overwritten by Cholesky
+        T *d_B_backup = nullptr;
+        cudaErrcheck(cudaMalloc(&d_B_backup, sizeof(T) * dim * lda));
+        cudaErrcheck(cudaMemcpy(d_B_backup, Mat_B, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice));
+
         cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim,
-                eigen_vec, lda, Mat_B, lda,
+                eigen_vec, lda,
+                d_B_backup, lda,
                 eigen_val);
+        cudaErrcheck(cudaFree(d_B_backup));
     }
 };
 
@@ -172,6 +180,9 @@ struct lapack_hegvx<T, DEVICE_GPU> {
         const char range = 'I';
         const char uplo = 'U';
         int meig = 0;
+
+        // this hegvdx will protect the input A, B from being overwritten
+        // and write the eigenvectors into eigen_vec.
         cuSolverConnector::hegvdx(cusolver_handle,
             itype, jobz, range, uplo,
             n, lda, A, B,
diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 185707ff8f..226710810d 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -215,6 +215,9 @@ struct lapack_hegvd<T, DEVICE_CPU> {
         // eigen_vec = Mat_A
         std::copy(Mat_A, Mat_A + dim*lda, eigen_vec);
 
+        Tensor aux_B(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {dim * lda});
+        std::copy(Mat_B, Mat_B + dim * lda, aux_B.data<T>());
+
         const int itype = 1;
         const char jobz = 'V';
         const char uplo = 'L';
@@ -232,7 +235,7 @@ struct lapack_hegvd<T, DEVICE_CPU> {
         iwork.zero();
 
         // After this, eigen_vec will contain the matrix Z of eigenvectors
-        lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
+        lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, aux_B.data<T>(), lda, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
         if (info != 0) {
             throw std::runtime_error("hegvd failed with info = " + std::to_string(info));
         }
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index 2914ee9fe6..8ffc2631a4 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -76,7 +76,7 @@ struct lapack_heevx {
      *
      * @note
      * See LAPACK ZHEEVX or CHEEVX documentation for more details.
-     *
+     * This routine allocates auxiliary memory inside to prevent input matrix from being destroyed.
      */
     void operator()(
         const int dim,
@@ -137,6 +137,7 @@ struct lapack_hegvx {
      *
      * @note
      * See LAPACK ZHEGVX doc for more details.
+     * This routine allocates auxiliary memory inside to prevent input matrix from being destroyed.
      */
     void operator()(
         const int n,

From 569a79cfa0ea0030b66d7119030224d8c44c7714 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 20 Oct 2025 11:45:14 +0800
Subject: [PATCH 11/17] Switch on ATen/kernels/test

---
 source/source_base/module_container/ATen/kernels/lapack.h | 1 +
 source/source_base/module_container/CMakeLists.txt        | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index 8ffc2631a4..b570bc122b 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -105,6 +105,7 @@ struct lapack_hegvd {
      * @note
      * See LAPACK ZHEGVD or CHEGVD documentation for more details.
      * This function assumes that A and B have the same leading dimensions, lda.
+     * This function copies B to auxiliary memory to avoid being overwritten.
      */
     void operator()(
         const int n,
diff --git a/source/source_base/module_container/CMakeLists.txt b/source/source_base/module_container/CMakeLists.txt
index 80a8dce5fc..4a2be0ef29 100644
--- a/source/source_base/module_container/CMakeLists.txt
+++ b/source/source_base/module_container/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
 
 add_library(container STATIC ${ATen_CPU_SRCS} ${ATen_CUDA_SRCS})
 
-target_link_libraries(container PUBLIC 
+target_link_libraries(container PUBLIC
   ${ATen_CPU_DEPENDENCY_LIBS} ${ATen_CUDA_DEPENDENCY_LIBS} ${ATen_ROCM_DEPENDENCY_LIBS})
 
 if(ENABLE_COVERAGE)
@@ -31,5 +31,6 @@ endif()
 if(BUILD_TESTING)
   if(ENABLE_MPI)
     add_subdirectory(test)
+    add_subdirectory(ATen/kernels/test)
   endif()
 endif()

From abf8340bbf2e96fc1afa2f36a0c78b2876ed336b Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Mon, 20 Oct 2025 15:12:06 +0800
Subject: [PATCH 12/17] Change container test TARGET name to be auto-run by CI

---
 .../module_container/ATen/kernels/CMakeLists.txt            | 2 +-
 .../module_container/ATen/kernels/test/CMakeLists.txt       | 4 ++--
 .../module_container/ATen/ops/test/CMakeLists.txt           | 6 +++---
 source/source_base/module_container/CMakeLists.txt          | 1 -
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/CMakeLists.txt b/source/source_base/module_container/ATen/kernels/CMakeLists.txt
index 92f7a9fccc..9faecec778 100644
--- a/source/source_base/module_container/ATen/kernels/CMakeLists.txt
+++ b/source/source_base/module_container/ATen/kernels/CMakeLists.txt
@@ -16,4 +16,4 @@ if(BUILD_TESTING)
   if(ENABLE_MPI)
     add_subdirectory(test)
   endif()
-endif()
\ No newline at end of file
+endif()
diff --git a/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt b/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt
index 2e07d84e6a..0ca3d97c26 100644
--- a/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt
+++ b/source/source_base/module_container/ATen/kernels/test/CMakeLists.txt
@@ -1,8 +1,8 @@
 AddTest(
-  TARGET container_kernels_uts
+  TARGET MODULE_BASE_container_kernels_uts
   LIBS parameter ${math_libs}
   SOURCES blas_test.cpp lapack_test.cpp
   memory_test.cpp linalg_test.cpp
 )
 
-target_link_libraries(container_kernels_uts container base device)
\ No newline at end of file
+target_link_libraries(MODULE_BASE_container_kernels_uts container base device)
diff --git a/source/source_base/module_container/ATen/ops/test/CMakeLists.txt b/source/source_base/module_container/ATen/ops/test/CMakeLists.txt
index 0cc2c090ca..d5103acfdd 100644
--- a/source/source_base/module_container/ATen/ops/test/CMakeLists.txt
+++ b/source/source_base/module_container/ATen/ops/test/CMakeLists.txt
@@ -1,7 +1,7 @@
 AddTest(
-  TARGET container_ops_uts
-  LIBS parameter ${math_libs} 
+  TARGET MODULE_BASE_container_ops_uts
+  LIBS parameter ${math_libs}
   SOURCES einsum_op_test.cpp linalg_op_test.cpp ../../kernels/lapack.cpp
 )
 
-target_link_libraries(container_ops_uts container base device)
\ No newline at end of file
+target_link_libraries(MODULE_BASE_container_ops_uts container base device)
diff --git a/source/source_base/module_container/CMakeLists.txt b/source/source_base/module_container/CMakeLists.txt
index 4a2be0ef29..14b5c57c52 100644
--- a/source/source_base/module_container/CMakeLists.txt
+++ b/source/source_base/module_container/CMakeLists.txt
@@ -31,6 +31,5 @@ endif()
 if(BUILD_TESTING)
   if(ENABLE_MPI)
     add_subdirectory(test)
-    add_subdirectory(ATen/kernels/test)
   endif()
 endif()

From 5f05bf434e385816d64c66318cf1ef135b1e5b07 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Tue, 21 Oct 2025 01:29:49 +0800
Subject: [PATCH 13/17] Add test for heevx

---
 .../ATen/kernels/cuda/lapack.cu               |  4 ++
 .../ATen/kernels/test/lapack_test.cpp         | 57 ++++++++++++++++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
index cbd10a67a2..96b24f243a 100644
--- a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
+++ b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -6,6 +6,9 @@
 #include <cuda_runtime.h>
 #include <thrust/complex.h>
 
+#include <cassert>
+
+
 namespace container {
 namespace kernels {
 
@@ -112,6 +115,7 @@ struct lapack_heevx<T, DEVICE_GPU> {
         Real *d_eigen_val,
         T *d_eigen_vec)
     {
+        assert(n <= lda);
         // copy d_Mat to d_eigen_vec, and results will be overwritten into d_eigen_vec
         // by cuSolver
         cudaErrcheck(cudaMemcpy(d_eigen_vec, d_Mat, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice));
diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
index 8027d89f82..3348ecda3e 100644
--- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -138,6 +138,59 @@ TYPED_TEST(LapackTest, heevd) {
     EXPECT_EQ(expected_C1, expected_C2);
 }
 
+TYPED_TEST(LapackTest, heevx) {
+    using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
+    using Real = typename GetTypeReal<Type>::type;
+    using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
+
+    blas_gemm<Type, Device> gemmCalculator;
+    blas_axpy<Type, Device> axpyCalculator;
+    lapack_heevx<Type, Device> heevxCalculator;
+
+    const int dim = 3;
+    const int neig = 2;  // Compute first 2 eigenvalues
+
+    Tensor A = std::move(Tensor({static_cast<Type>(4.0), static_cast<Type>(1.0), static_cast<Type>(1.0),
+                                 static_cast<Type>(1.0), static_cast<Type>(5.0), static_cast<Type>(3.0),
+                                 static_cast<Type>(1.0), static_cast<Type>(3.0), static_cast<Type>(6.0)}).to_device<Device>());
+
+    Tensor E = std::move(Tensor({static_cast<Real>(0.0), static_cast<Real>(0.0)}).to_device<Device>());
+    Tensor V = A;
+    Tensor expected_C1 = std::move(Tensor({static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(0.0),
+                                           static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(0.0)}).to_device<Device>());
+    Tensor expected_C2 = expected_C1;
+    expected_C1.zero();
+    expected_C2.zero();
+
+    const char trans = 'N';
+    const int m = 3;
+    const int n = neig;
+    const int k = 3;
+    const Type alpha = static_cast<Type>(1.0);
+    const Type beta  = static_cast<Type>(0.0);
+
+    // Compute first neig eigenvalues and eigenvectors using heevx
+    heevxCalculator(dim, dim, A.data<Type>(), neig, E.data<Real>(), V.data<Type>());
+
+    E = E.to_device<ct::DEVICE_CPU>();
+    const Tensor Alpha = std::move(Tensor({
+            static_cast<Type>(E.data<Real>()[0]),
+            static_cast<Type>(E.data<Real>()[1])}));
+
+    // Check the eigenvalues and eigenvectors
+    // A * x = lambda * x for the first neig eigenvectors
+    // get A*V
+    gemmCalculator(trans, trans, m, n, k, &alpha, A.data<Type>(), m, V.data<Type>(), k, &beta, expected_C1.data<Type>(), m);
+    // get E*V
+    for (int ii = 0; ii < neig; ii++) {
+        axpyCalculator(dim, Alpha.data<Type>() + ii, V.data<Type>() + ii * dim, 1, expected_C2.data<Type>() + ii * dim, 1);
+    }
+    // check that A*V = E*V
+    E = E.to_device<DEVICE_CPU>();
+    V = V.to_device<DEVICE_CPU>();
+
+    EXPECT_EQ(expected_C1, expected_C2);
+}
 
 TYPED_TEST(LapackTest, hegvd) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
@@ -189,5 +242,7 @@ TYPED_TEST(LapackTest, hegvd) {
     EXPECT_EQ(expected_C1, expected_C2);
 }
 
-} // namespace op
+
+
+} // namespace kernels
 } // namespace container

From 11c19c89d44987eb530f3ac55c788d82465da797 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Tue, 21 Oct 2025 01:49:26 +0800
Subject: [PATCH 14/17] Add test for hegvx

---
 .../ATen/kernels/test/lapack_test.cpp         | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
index 3348ecda3e..d2597024ab 100644
--- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -188,6 +188,19 @@ TYPED_TEST(LapackTest, heevx) {
     // check that A*V = E*V
     E = E.to_device<DEVICE_CPU>();
     V = V.to_device<DEVICE_CPU>();
+    std::cout << "Eigenvalues E: ";
+    for (int i = 0; i < neig; i++) {
+        std::cout << E.data<Real>()[i] << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Eigenvectors V:" << std::endl;
+    for (int i = 0; i < dim; i++) {
+        for (int j = 0; j < neig; j++) {
+            std::cout << V.data<Type>()[i + j * dim] << " ";
+        }
+        std::cout << std::endl;
+    }
 
     EXPECT_EQ(expected_C1, expected_C2);
 }
@@ -242,7 +255,69 @@ TYPED_TEST(LapackTest, hegvd) {
     EXPECT_EQ(expected_C1, expected_C2);
 }
 
+TYPED_TEST(LapackTest, hegvx) {
+    using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
+    using Real = typename GetTypeReal<Type>::type;
+    using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
 
+    blas_gemm<Type, Device> gemmCalculator;
+    blas_axpy<Type, Device> axpyCalculator;
+    lapack_hegvx<Type, Device> hegvxCalculator;
+
+    const int dim = 3;
+    const int neig = 2;  // Compute first 2 eigenvalues
+
+    Tensor A = std::move(Tensor({static_cast<Type>(4.0), static_cast<Type>(1.0), static_cast<Type>(1.0),
+                                 static_cast<Type>(1.0), static_cast<Type>(5.0), static_cast<Type>(3.0),
+                                 static_cast<Type>(1.0), static_cast<Type>(3.0), static_cast<Type>(6.0)}).to_device<Device>());
+
+    Tensor B = std::move(Tensor({static_cast<Type>(2.0), static_cast<Type>(0.0), static_cast<Type>(0.0),
+                                 static_cast<Type>(0.0), static_cast<Type>(2.0), static_cast<Type>(0.0),
+                                 static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(2.0)}).to_device<Device>());
+
+    Tensor E = std::move(Tensor({static_cast<Real>(0.0), static_cast<Real>(0.0)}).to_device<Device>());
+    Tensor V = A;
+    Tensor expected_C1 = std::move(Tensor({static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(0.0),
+                                           static_cast<Type>(0.0), static_cast<Type>(0.0), static_cast<Type>(0.0)}).to_device<Device>());
+    Tensor expected_C2 = expected_C1;
+    Tensor C_temp = expected_C1;
+    expected_C1.zero();
+    expected_C2.zero();
+
+    const char trans = 'N';
+    const int m = 3;
+    const int n = neig;
+    const int k = 3;
+    const Type alpha = static_cast<Type>(1.0);
+    const Type beta  = static_cast<Type>(0.0);
+
+    // Compute first neig eigenvalues and eigenvectors using hegvx
+    hegvxCalculator(dim, dim, A.data<Type>(), B.data<Type>(), neig, E.data<Real>(), V.data<Type>());
+
+    E = E.to_device<ct::DEVICE_CPU>();
+    const Tensor Alpha = std::move(Tensor({
+            static_cast<Type>(E.data<Real>()[0]),
+            static_cast<Type>(E.data<Real>()[1])}));
+
+    // Check the eigenvalues and eigenvectors
+    // A * x = lambda * B * x for the first neig eigenvectors
+    // get A*V
+    gemmCalculator(trans, trans, m, n, k, &alpha, A.data<Type>(), m, V.data<Type>(), k, &beta, expected_C1.data<Type>(), m);
+    // get E * B * V
+    // where B is 2 * eye(3,3)
+    // get C_temp = B * V first
+    gemmCalculator(trans, trans, m, n, k, &alpha, B.data<Type>(), m, V.data<Type>(), k, &beta, C_temp.data<Type>(), m);
+    // then compute C2 = E * B * V
+    for (int ii = 0; ii < neig; ii++) {
+        axpyCalculator(dim, Alpha.data<Type>() + ii, C_temp.data<Type>() + ii * dim, 1, expected_C2.data<Type>() + ii * dim, 1);
+    }
+    // check that A*V = E*V
+    E = E.to_device<DEVICE_CPU>();
+    V = V.to_device<DEVICE_CPU>();
+
+
+    EXPECT_EQ(expected_C1, expected_C2);
+}
 
 } // namespace kernels
 } // namespace container

From 915cc10e489d27557bc13dc8c289dbd263c74af1 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Tue, 21 Oct 2025 02:16:07 +0800
Subject: [PATCH 15/17] Remove test output code

---
 .../ATen/kernels/test/lapack_test.cpp               | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
index d2597024ab..a10c3ef883 100644
--- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -188,19 +188,6 @@ TYPED_TEST(LapackTest, heevx) {
     // check that A*V = E*V
     E = E.to_device<DEVICE_CPU>();
     V = V.to_device<DEVICE_CPU>();
-    std::cout << "Eigenvalues E: ";
-    for (int i = 0; i < neig; i++) {
-        std::cout << E.data<Real>()[i] << " ";
-    }
-    std::cout << std::endl;
-
-    std::cout << "Eigenvectors V:" << std::endl;
-    for (int i = 0; i < dim; i++) {
-        for (int j = 0; j < neig; j++) {
-            std::cout << V.data<Type>()[i + j * dim] << " ";
-        }
-        std::cout << std::endl;
-    }
 
     EXPECT_EQ(expected_C1, expected_C2);
 }

From 1ae41a12f52829a3fe2d1232400a5b83ab35788d Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Tue, 21 Oct 2025 10:40:56 +0800
Subject: [PATCH 16/17] Clean the code and add docs

---
 .../module_container/ATen/kernels/lapack.cpp  |  5 --
 .../module_container/ATen/kernels/lapack.h    | 47 ++++++++++++++-----
 .../ATen/kernels/test/lapack_test.cpp         | 22 ++++-----
 3 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
index 226710810d..0c3e72d76c 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.cpp
+++ b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -3,7 +3,6 @@
 
 #include <base/third_party/lapack.h>
 
-// #include <cstring> // std::memcpy
 #include <algorithm> // std::copy
 #include <complex>
 #include <stdexcept>
@@ -208,10 +207,6 @@ struct lapack_hegvd<T, DEVICE_CPU> {
         // first copy Mat_A to eigen_vec
         // then pass as argument "A" in lapack hegvd
         // and this block of memory will be overwritten by eigenvectors
-        // for (int i = 0; i < dim * lda; ++i){
-        //     eigen_vec[i] = Mat_A[i];
-        // }
-        // std::memcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda);
         // eigen_vec = Mat_A
         std::copy(Mat_A, Mat_A + dim*lda, eigen_vec);
 
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
index b570bc122b..b3a4a40c4e 100644
--- a/source/source_base/module_container/ATen/kernels/lapack.h
+++ b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -40,7 +40,18 @@ struct lapack_potrf {
         const int& lda);
 };
 
-
+// ============================================================================
+// Standard Hermitian Eigenvalue Problem Solvers
+// ============================================================================
+// The following structures (lapack_heevd and lapack_heevx) implement solvers
+// for standard Hermitian eigenvalue problems of the form:
+//      A * x = lambda * x
+// where:
+//   - A is a Hermitian matrix
+//   - lambda are the eigenvalues to be computed
+//   - x are the corresponding eigenvectors
+//
+// ============================================================================
 template <typename T, typename Device>
 struct lapack_heevd {
     using Real = typename GetTypeReal<T>::type;
@@ -61,18 +72,15 @@ struct lapack_heevx {
      * This function solves the problem A*x = lambda*x, where A is a Hermitian matrix.
      * It computes a subset of eigenvalues and, optionally, the corresponding eigenvectors.
      *
-     * @param jobz  'N': Compute eigenvalues only; 'V': Compute eigenvalues and eigenvectors.
-     * @param range 'A': All eigenvalues; 'V': Eigenvalues in the half-open interval (vl, vu]; 'I': Eigenvalues with indices il through iu.
-     * @param uplo  'U': Upper triangle of A is stored; 'L': Lower triangle is stored.
      * @param dim   The order of the matrix A. dim >= 0.
-     * @param Mat   On entry, the Hermitian matrix A. On exit, it may be overwritten.
-     * @param vl    Lower bound of the interval to search for eigenvalues if range == 'V'.
-     * @param vu    Upper bound of the interval to search for eigenvalues if range == 'V'.
-     * @param il    Index of the smallest eigenvalue to be returned if range == 'I'.
-     * @param iu    Index of the largest eigenvalue to be returned if range == 'I'.
-     * @param m     Output: The total number of found eigenvalues.
-     * @param eigen_val Array to store the computed eigenvalues in ascending order.
-     * @param eigen_vec If not nullptr and jobz == 'V', array to store the computed eigenvectors.
+     * @param lda   The leading dimension of the array Mat. lda >= max(1, dim).
+     * @param Mat   On entry, the Hermitian matrix A. On exit, A is kept.
+     * @param neig  The number of eigenvalues to be found. 0 <= neig <= dim.
+     * @param eigen_val On normal exit, the first \p neig elements contain the selected
+     *                  eigenvalues in ascending order.
+     * @param eigen_vec If eigen_vec is not nullptr, then on exit it contains the
+     *                  orthonormal eigenvectors of the matrix A. The eigenvectors are stored in
+     *                  the columns of eigen_vec, in the same order as the eigenvalues.
      *
      * @note
      * See LAPACK ZHEEVX or CHEEVX documentation for more details.
@@ -87,6 +95,21 @@ struct lapack_heevx {
         T *eigen_vec);
 };
 
+
+// ============================================================================
+// Generalized Hermitian-definite Eigenvalue Problem Solvers
+// ============================================================================
+// The following structures (lapack_hegvd and lapack_hegvx) implement solvers
+// for generalized Hermitian-definite eigenvalue problems of the form:
+//      A * x = lambda * B * x
+// where:
+//   - A is a Hermitian matrix
+//   - B is a Hermitian positive definite matrix
+//   - lambda are the eigenvalues to be computed
+//   - x are the corresponding eigenvectors
+//
+// ============================================================================
+
 template <typename T, typename Device>
 struct lapack_hegvd {
     using Real = typename GetTypeReal<T>::type;
diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
index a10c3ef883..277f6a33b8 100644
--- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -92,6 +92,9 @@ TYPED_TEST(LapackTest, Potrf) {
     EXPECT_EQ(A, C);
 }
 
+// Test for lapack_heevd and lapack_heevx:
+// Solve a standard eigenvalue problem
+// and check that A*V = V*E
 TYPED_TEST(LapackTest, heevd) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
     using Real = typename GetTypeReal<Type>::type;
@@ -179,19 +182,19 @@ TYPED_TEST(LapackTest, heevx) {
 
     // Check the eigenvalues and eigenvectors
     // A * x = lambda * x for the first neig eigenvectors
-    // get A*V
+    // check that A * V = V * E
+    // get A * V
     gemmCalculator(trans, trans, m, n, k, &alpha, A.data<Type>(), m, V.data<Type>(), k, &beta, expected_C1.data<Type>(), m);
-    // get E*V
+    // get V * E
     for (int ii = 0; ii < neig; ii++) {
         axpyCalculator(dim, Alpha.data<Type>() + ii, V.data<Type>() + ii * dim, 1, expected_C2.data<Type>() + ii * dim, 1);
-    }
-    // check that A*V = E*V
-    E = E.to_device<DEVICE_CPU>();
-    V = V.to_device<DEVICE_CPU>();
 
     EXPECT_EQ(expected_C1, expected_C2);
 }
 
+// Test for lapack_hegvd and lapack_hegvx
+// Solve a generalized eigenvalue problem
+// and check that A * v = e * B * v
 TYPED_TEST(LapackTest, hegvd) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
     using Real = typename GetTypeReal<Type>::type;
@@ -288,7 +291,8 @@ TYPED_TEST(LapackTest, hegvx) {
 
     // Check the eigenvalues and eigenvectors
     // A * x = lambda * B * x for the first neig eigenvectors
-    // get A*V
+    // check that A * V = E * B * V
+    // get A * V
     gemmCalculator(trans, trans, m, n, k, &alpha, A.data<Type>(), m, V.data<Type>(), k, &beta, expected_C1.data<Type>(), m);
     // get E * B * V
     // where B is 2 * eye(3,3)
@@ -298,10 +302,6 @@ TYPED_TEST(LapackTest, hegvx) {
     for (int ii = 0; ii < neig; ii++) {
         axpyCalculator(dim, Alpha.data<Type>() + ii, C_temp.data<Type>() + ii * dim, 1, expected_C2.data<Type>() + ii * dim, 1);
     }
-    // check that A*V = E*V
-    E = E.to_device<DEVICE_CPU>();
-    V = V.to_device<DEVICE_CPU>();
-
 
     EXPECT_EQ(expected_C1, expected_C2);
 }

From ac973429b3b8e13481b7e643d8430447a41dcf50 Mon Sep 17 00:00:00 2001
From: Chen Nuo <49788094+Cstandardlib@users.noreply.github.com>
Date: Tue, 21 Oct 2025 10:56:12 +0800
Subject: [PATCH 17/17] Fix mismatched parentheses

---
 .../module_container/ATen/kernels/test/lapack_test.cpp           | 1 +
 1 file changed, 1 insertion(+)

diff --git a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
index 277f6a33b8..acd903bb60 100644
--- a/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/source_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -188,6 +188,7 @@ TYPED_TEST(LapackTest, heevx) {
     // get V * E
     for (int ii = 0; ii < neig; ii++) {
         axpyCalculator(dim, Alpha.data<Type>() + ii, V.data<Type>() + ii * dim, 1, expected_C2.data<Type>() + ii * dim, 1);
+    }
 
     EXPECT_EQ(expected_C1, expected_C2);
 }