Commit 5e7cf0d

Update lapack_hegvd interface to support lda different from n
1 parent 4c72902 commit 5e7cf0d

File tree

7 files changed: +452 additions, -85 deletions


source/source_base/module_container/ATen/kernels/cuda/lapack.cu

Lines changed: 25 additions & 6 deletions
@@ -105,18 +105,37 @@ template <typename T>
 struct lapack_hegvd<T, DEVICE_GPU> {
     using Real = typename GetTypeReal<T>::type;
     void operator()(
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
+        const int dim,
+        const int lda,
         T* Mat_A,
         T* Mat_B,
-        const int& dim,
-        Real* eigen_val)
+        Real* eigen_val,
+        T *eigen_vec)
     {
-        cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val);
+        const int itype = 1;
+        const char jobz = 'V';
+        const char uplo = 'U';
+        cudaErrcheck(cudaMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, cudaMemcpyDeviceToDevice));
+        cuSolverConnector::hegvd(cusolver_handle, itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val);
     }
 };
 
+// template <typename T>
+// struct lapack_hegvx<T, DEVICE_GPU> {
+//     using Real = typename GetTypeReal<T>::type;
+//     void operator()(
+//         const int n,
+//         const int lda,
+//         T *A,
+//         T *B,
+//         const int m,
+//         Real *eigen_val,
+//         T *eigen_vec)
+//     {
+//         cuSolverConnector::hegvx(cusolver_handle, n, lda, A, B, m, eigen_val, eigen_vec);
+//     }
+// };
+//
 template <typename T>
 struct lapack_getrf<T, DEVICE_GPU> {
     void operator()(
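
For reference, the sketch below shows how the updated GPU op might be driven from host code after this change: matrices sit in column-major storage with a padded leading dimension (lda > dim), and the eigenvectors come back in a separate device buffer while Mat_A is left untouched. The functor call follows the new declaration in lapack.h; the include path, the container::DEVICE_GPU tag, and createGpuSolverHandle() as the counterpart of the destroyGpuSolverHandle() shown in the header are assumptions about the surrounding ATen code, not something this commit spells out. Error checking of the CUDA runtime calls is omitted for brevity.

#include <complex>
#include <vector>
#include <cuda_runtime.h>
#include <ATen/kernels/lapack.h>   // include path assumed

int main()
{
    using T = std::complex<double>;
    const int dim = 2, lda = 4;                        // lda > dim: each stored column is padded
    const size_t bytes = sizeof(T) * dim * lda;

    // Host matrices, column-major: element (i, j) lives at index i + j * lda.
    std::vector<T> h_A(dim * lda), h_B(dim * lda);
    h_A[0 + 0 * lda] = {2.0, 0.0};  h_A[0 + 1 * lda] = {0.0, 1.0};
    h_A[1 + 0 * lda] = {0.0, -1.0}; h_A[1 + 1 * lda] = {3.0, 0.0};   // Hermitian A
    h_B[0 + 0 * lda] = {1.0, 0.0};  h_B[1 + 1 * lda] = {1.0, 0.0};   // B = identity (positive definite)

    T *d_A = nullptr, *d_B = nullptr, *d_vec = nullptr;
    double* d_val = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&d_A), bytes);
    cudaMalloc(reinterpret_cast<void**>(&d_B), bytes);
    cudaMalloc(reinterpret_cast<void**>(&d_vec), bytes);
    cudaMalloc(reinterpret_cast<void**>(&d_val), sizeof(double) * dim);
    cudaMemcpy(d_A, h_A.data(), bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B.data(), bytes, cudaMemcpyHostToDevice);

    container::kernels::createGpuSolverHandle();       // assumed counterpart of destroyGpuSolverHandle()
    // Eigenvectors are written to d_vec; d_A itself is not modified by the new interface.
    container::kernels::lapack_hegvd<T, container::DEVICE_GPU>()(dim, lda, d_A, d_B, d_val, d_vec);
    container::kernels::destroyGpuSolverHandle();

    std::vector<double> eigen_val(dim);
    cudaMemcpy(eigen_val.data(), d_val, sizeof(double) * dim, cudaMemcpyDeviceToHost);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_vec); cudaFree(d_val);
    return 0;
}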

source/source_base/module_container/ATen/kernels/lapack.cpp

Lines changed: 105 additions & 13 deletions
@@ -2,6 +2,9 @@
 
 #include <base/third_party/lapack.h>
 
+// #include <cstring> // std::memcpy
+#include <algorithm> // std::copy
+
 namespace container {
 namespace kernels {
 
@@ -36,7 +39,7 @@ struct lapack_trtri<T, DEVICE_CPU> {
         const char& diag,
         const int& dim,
         T* Mat,
-        const int& lda)
+        const int& lda)
     {
         int info = 0;
         lapackConnector::trtri(uplo, diag, dim, Mat, lda, info);
@@ -51,8 +54,8 @@ struct lapack_potrf<T, DEVICE_CPU> {
     void operator()(
         const char& uplo,
         const int& dim,
-        T* Mat,
-        const int& lda)
+        T* Mat,
+        const int& lda)
     {
         int info = 0;
         lapackConnector::potrf(uplo, dim, Mat, dim, info);
@@ -85,7 +88,7 @@ struct lapack_heevd<T, DEVICE_CPU> {
         Tensor iwork(DataTypeToEnum<int>::value, DeviceType::CpuDevice, {liwork});
         iwork.zero();
 
-        lapackConnector::heevd(jobz, uplo, dim, Mat, dim, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
+        lapackConnector::heevd(jobz, uplo, dim, Mat, dim, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
         if (info != 0) {
             throw std::runtime_error("heevd failed with info = " + std::to_string(info));
         }
@@ -96,14 +99,26 @@ template <typename T>
 struct lapack_hegvd<T, DEVICE_CPU> {
     using Real = typename GetTypeReal<T>::type;
     void operator()(
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
-        T* Mat_A,
-        T* Mat_B,
-        const int& dim,
-        Real* eigen_val)
+        const int dim,
+        const int lda,
+        T *Mat_A,
+        T *Mat_B,
+        Real *eigen_val,
+        T *eigen_vec)
     {
+        // first copy Mat_A to eigen_vec
+        // then pass as argument "A" in lapack hegvd
+        // and this block of memory will be overwritten by eigenvectors
+        // for (int i = 0; i < dim * lda; ++i){
+        //     eigen_vec[i] = Mat_A[i];
+        // }
+        // std::memcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda);
+        // eigen_vec = Mat_A
+        std::copy(Mat_A, Mat_A + dim*lda, eigen_vec);
+
+        const int itype = 1;
+        const char jobz = 'V';
+        const char uplo = 'U';
         int info = 0;
         int lwork = std::max(2 * dim + dim * dim, 1 + 6 * dim + 2 * dim * dim);
         Tensor work(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {lwork});
@@ -117,13 +132,90 @@ struct lapack_hegvd<T, DEVICE_CPU> {
         Tensor iwork(DataType::DT_INT, DeviceType::CpuDevice, {liwork});
         iwork.zero();
 
-        lapackConnector::hegvd(itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
+        // After this, eigen_vec will contain the matrix Z of eigenvectors
+        lapackConnector::hegvd(itype, jobz, uplo, dim, eigen_vec, lda, Mat_B, lda, eigen_val, work.data<T>(), lwork, rwork.data<Real>(), lrwork, iwork.data<int>(), liwork, info);
         if (info != 0) {
             throw std::runtime_error("hegvd failed with info = " + std::to_string(info));
         }
     }
 };
 
+
+// template <typename T>
+// struct lapack_hegvx<T, DEVICE_CPU> {
+//     using Real = typename GetTypeReal<T>::type;
+//     void operator()(
+//         const int n,
+//         const int lda,
+//         T *A,
+//         T *B,
+//         const int m,
+//         Real *eigen_val,
+//         T *eigen_vec)
+//     {
+//         int info = 0;
+
+//         int mm = m;
+
+//         int lwork = -1;
+
+//         T *work = new T[1];
+//         Real *rwork = new Real[7 * n];
+//         int *iwork = new int[5 * n];
+//         int *ifail = new int[n];
+
+//         // set lwork = -1 to query optimal work size
+//         lapackConnector::hegvx(1, // ITYPE = 1: A*x = (lambda)*B*x
+//                                'V', 'I', 'U',
+//                                n, A, lda, B, lda,
+//                                0.0, 0.0,
+//                                1, m, 0.0, mm,
+//                                eigen_val, eigen_vec, lda,
+//                                work,
+//                                lwork, // lwork = 1, query optimal size.
+//                                rwork, iwork, ifail,
+//                                info);
+
+//         // !> If LWORK = -1, then a workspace query is assumed; the routine
+//         // !> only calculates the optimal size of the WORK array, returns
+//         // !> this value as the first entry of the WORK array.
+//         lwork = int(get_real(work[0]));
+//         delete[] work;
+//         work = new T[lwork];
+
+//         lapackConnector::hegvx(
+//             1,          // ITYPE = 1: A*x = (lambda)*B*x
+//             'V',        // JOBZ = 'V': Compute eigenvalues and eigenvectors.
+//             'I',        // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
+//             'U',        // UPLO = 'U': Upper triangles of A and B are stored.
+//             n,          // order of the matrices A and B.
+//             A,          // A is COMPLEX*16 array dimension (LDA, N)
+//             lda,        // leading dimension of the array A.
+//             B,          // B is COMPLEX*16 array, dimension (LDB, N)
+//             lda,        // assume that leading dimension of B is the same as A.
+//             0.0,        // VL, Not referenced if RANGE = 'A' or 'I'.
+//             0.0,        // VU, Not referenced if RANGE = 'A' or 'I'.
+//             1,          // IL: If RANGE='I', the index of the smallest eigenvalue to be returned. 1 <= IL <= IU <= N,
+//             m,          // IU: If RANGE='I', the index of the largest eigenvalue to be returned. 1 <= IL <= IU <= N,
+//             0.0,        // ABSTOL
+//             mm,         // M: The total number of eigenvalues found. 0 <= M <= N. if RANGE = 'I', M = IU-IL+1.
+//             eigen_val,  // W store eigenvalues
+//             eigen_vec,  // Z store eigenvector
+//             lda,        // LDZ: The leading dimension of the array Z.
+//             work,
+//             lwork,
+//             rwork,
+//             iwork,
+//             ifail,
+//             info);
+
+//         delete[] work;
+//         delete[] rwork;
+//         delete[] iwork;
+//         delete[] ifail;
+//     }
+// };
+
 template <typename T>
 struct lapack_getrf<T, DEVICE_CPU> {
     void operator()(
@@ -220,4 +312,4 @@ template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
 template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;
 
 } // namespace kernels
-} // namespace container
+} // namespace container
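
A note on the std::copy in the new CPU operator: in column-major (LAPACK) storage with leading dimension lda, a dim x dim matrix occupies dim columns of lda entries each, so element (i, j) sits at index i + j * lda and the whole stored block is dim * lda values, padding included. A minimal, self-contained illustration of that layout (the identity matrix here is only an example):

#include <complex>
#include <cstdio>
#include <vector>

int main()
{
    const int dim = 3, lda = 5;                          // lda > dim: two padding rows per column
    std::vector<std::complex<double>> A(dim * lda);      // dim columns, lda rows actually stored
    for (int j = 0; j < dim; ++j)
        for (int i = 0; i < dim; ++i)
            A[i + j * lda] = {i == j ? 1.0 : 0.0, 0.0};  // identity in the leading dim x dim block
    // Copying dim * lda elements, as the new hegvd op does, carries the whole stored block
    // (useful entries and padding alike) into the eigenvector buffer before the LAPACK call.
    std::printf("stored elements: %zu\n", A.size());     // prints 15, i.e. dim * lda
    return 0;
}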

source/source_base/module_container/ATen/kernels/lapack.h

Lines changed: 57 additions & 9 deletions
@@ -35,7 +35,7 @@ struct lapack_potrf {
     void operator()(
         const char& uplo,
         const int& dim,
-        T* Mat,
+        T* Mat,
         const int& lda);
 };
 
@@ -55,16 +55,64 @@ struct lapack_heevd {
 template <typename T, typename Device>
 struct lapack_hegvd {
     using Real = typename GetTypeReal<T>::type;
+    /**
+     * @brief Computes all the eigenvalues and, optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem.
+     *
+     * This function solves the problem A*x = lambda*B*x, where A and B are Hermitian matrices, and B is also positive definite.
+     *
+     * @param dim The order of the matrices Mat_A and Mat_B. dim >= 0.
+     * @param lda The leading dimension of the arrays Mat_A and Mat_B. lda >= max(1, dim).
+     * @param Mat_A On entry, the Hermitian matrix A. On exit, it may be overwritten.
+     * @param Mat_B On entry, the Hermitian positive definite matrix B. On exit, it may be overwritten.
+     * @param eigen_val Array to store the computed eigenvalues in ascending order.
+     * @param eigen_vec If not nullptr, array to store the computed eigenvectors.
+     *
+     * @note
+     * See LAPACK ZHEGVD or CHEGVD documentation for more details.
+     * This function assumes that A and B have the same leading dimensions, lda.
+     */
     void operator()(
-        const int& itype,
-        const char& jobz,
-        const char& uplo,
-        T* Mat_A,
-        T* Mat_B,
-        const int& dim,
-        Real* eigen_val);
+        const int dim,
+        const int lda,
+        T *Mat_A,
+        T *Mat_B,
+        Real *eigen_val,
+        T *eigen_vec);
 };
 
+// template <typename T, typename Device>
+// struct lapack_hegvx {
+//     using Real = typename GetTypeReal<T>::type;
+//     /**
+//      * @ brief hegvx computes the first m eigenvalues and their corresponding eigenvectors of
+//      * a complex generalized Hermitian-definite eigenproblem.
+//      *
+//      * In this op, the CPU version is implemented through the `hegvx` interface, and the CUDA version
+//      * is implemented through the `evd` interface and acquires the first m eigenpairs
+//      *
+//      * hegvx 'V' 'I' 'U' is used to compute the first m eigenpairs of the problem
+//      *
+//      * @param n The order of the matrices A and B. n >= 0.
+//      * @param lda The leading dimension of the array A and B. lda >= max(1, n).
+//      * @param A On entry, the Hermitian matrix A. On exit, if info = 0, A contains the matrix Z of eigenvectors.
+//      * @param B On entry, the Hermitian positive definite matrix B. On exit, the triangular factor from the Cholesky factorization of B.
+//      * @param m The number of eigenvalues and eigenvectors to be found. 0 < m <= n.
+//      * @param eigen_val The first m eigenvalues in ascending order.
+//      * @param eigen_vec The first m columns contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues.
+//      *
+//      * @note
+//      * See LAPACK ZHEGVX doc for more details.
+//      */
+//     void operator()(
+//         const int n,
+//         const int lda,
+//         T *A,
+//         T *B,
+//         const int m,
+//         Real *eigen_val,
+//         T *eigen_vec);
+// };
+
 
 template <typename T, typename Device>
 struct lapack_getrf {
@@ -110,4 +158,4 @@ void destroyGpuSolverHandle(); // destroy cusolver handle
 } // namespace container
 } // namespace kernels
 
-#endif // ATEN_KERNELS_LAPACK_H_
+#endif // ATEN_KERNELS_LAPACK_H_
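
To make the documented contract concrete, here is a hedged usage sketch of the CPU specialization with lda larger than dim. The functor call mirrors the declaration above; the include path and the container::DEVICE_CPU tag are assumed from the surrounding ATen container code rather than shown by this commit.

#include <complex>
#include <cstdio>
#include <vector>
#include <ATen/kernels/lapack.h>   // include path assumed

int main()
{
    using T = std::complex<double>;
    const int dim = 2, lda = 4;                     // leading dimension larger than the matrix order
    std::vector<T> A(dim * lda), B(dim * lda), evec(dim * lda);
    std::vector<double> eval(dim);

    // Column-major, element (i, j) at i + j * lda; only the upper triangle is referenced (uplo = 'U').
    A[0 + 0 * lda] = {2.0, 0.0};  A[0 + 1 * lda] = {0.0, 1.0};  A[1 + 1 * lda] = {3.0, 0.0};
    B[0 + 0 * lda] = {1.0, 0.0};  B[1 + 1 * lda] = {1.0, 0.0};  // B = identity, positive definite

    // Mat_A is copied internally, so A is preserved; eigenvectors land in evec, eigenvalues in eval.
    container::kernels::lapack_hegvd<T, container::DEVICE_CPU>()(   // DEVICE_CPU tag assumed in namespace container
        dim, lda, A.data(), B.data(), eval.data(), evec.data());

    std::printf("lambda_0 = %f, lambda_1 = %f\n", eval[0], eval[1]);
    return 0;
}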

source/source_base/module_container/ATen/kernels/rocm/lapack.hip.cu

Lines changed: 15 additions & 8 deletions
@@ -28,8 +28,8 @@ void destroyGpuSolverHandle() {
 template <typename T>
 __global__ void set_matrix_kernel(
     const char uplo,
-    T* A,
-    const int dim)
+    T* A,
+    const int dim)
 {
     int bid = blockIdx.x;
     int tid = threadIdx.x;
@@ -64,7 +64,7 @@ struct lapack_trtri<T, DEVICE_GPU> {
         const char& diag,
         const int& dim,
         T* Mat,
-        const int& lda)
+        const int& lda)
     {
         // TODO: trtri is not implemented in this method yet
         // Cause the trtri in cuSolver is not stable for ABACUS!
@@ -82,8 +82,8 @@ struct lapack_potrf<T, DEVICE_GPU> {
     void operator()(
         const char& uplo,
         const int& dim,
-        T* Mat,
-        const int& lda)
+        T* Mat,
+        const int& lda)
     {
         // hipSolverConnector::potrf(hipsolver_handle, uplo, dim, Mat, dim);
         std::vector<T> H_Mat(dim * dim, static_cast<T>(0.0));
@@ -118,15 +118,22 @@ template <typename T>
 struct lapack_hegvd<T, DEVICE_GPU> {
     using Real = typename GetTypeReal<T>::type;
     void operator()(
+        const int dim,
+        const int lda,
         const int& itype,
         const char& jobz,
         const char& uplo,
         T* Mat_A,
         T* Mat_B,
         const int& dim,
-        Real* eigen_val)
+        Real* eigen_val,
+        T *eigen_vec)
     {
-        hipSolverConnector::hegvd(hipsolver_handle, itype, jobz, uplo, dim, Mat_A, dim, Mat_B, dim, eigen_val);
+        const int itype = 1;
+        const char jobz = 'V';
+        const char uplo = 'U';
+        hipErrcheck(hipMemcpy(eigen_vec, Mat_A, sizeof(T) * dim * lda, hipMemcpyDeviceToDevice));
+        hipSolverConnector::hegvd(hipsolver_handle, itype, jobz, uplo, dim, Mat_A, lda, Mat_B, lda, eigen_val);
     }
 };
 
@@ -156,4 +163,4 @@ template struct lapack_hegvd<std::complex<float>, DEVICE_GPU>;
 template struct lapack_hegvd<std::complex<double>, DEVICE_GPU>;
 
 } // namespace kernels
-} // namespace container
+} // namespace container
