deepmodeling
diff --git a/source/module_base/module_container/ATen/kernels/cuda/lapack.cu b/source/module_base/module_container/ATen/kernels/cuda/lapack.cu
Lines changed: 58 additions & 0 deletions
diff --git a/source/module_base/module_container/ATen/kernels/lapack.cpp b/source/module_base/module_container/ATen/kernels/lapack.cpp
Lines changed: 29 additions & 4 deletions
diff --git a/source/module_base/module_container/ATen/kernels/lapack.h b/source/module_base/module_container/ATen/kernels/lapack.h
Lines changed: 14 additions & 4 deletions
@@ -117,6 +117,49 @@ struct lapack_dngvd<T, DEVICE_GPU> {
     }
 };
 
+template <typename T> // Partial specialization of lapack_getrf for DEVICE_GPU, generic over element type T.
+struct lapack_getrf<T, DEVICE_GPU> {
+    void operator()( // Forwards every argument unchanged to cuSolverConnector::getrf.
+        const int& m, // presumably the row count of Mat (LAPACK getrf convention) - TODO confirm
+        const int& n, // presumably the column count of Mat - TODO confirm
+        T* Mat, // matrix buffer handed to the connector; NOTE(review): likely expected to be device memory - confirm
+        const int& lda, // presumably the leading dimension of Mat - TODO confirm
+        int* ipiv) // non-const buffer passed through to the connector; presumably receives pivot indices - confirm
+    {
+        cuSolverConnector::getrf(cusolver_handle, m, n, Mat, lda, ipiv); // cusolver_handle is defined outside this hunk; this wrapper inspects no status itself
+    }
+};
+
+template <typename T> // Partial specialization of lapack_getri for DEVICE_GPU, generic over element type T.
+struct lapack_getri<T, DEVICE_GPU> {
+    void operator()( // Always throws std::runtime_error; none of the parameters below is read.
+        const int& n, // unused
+        T* Mat, // unused
+        const int& lda, // unused
+        const int* ipiv, // unused
+        T* work, // unused
+        const int& lwork) // unused
+    { // The body makes no library call: it only raises the error below, whose message points callers to getrs.
+        throw std::runtime_error("cuSOLVER does not provide LU-based matrix inversion interface (getri). To compute the inverse on GPU, use getrs instead.");
+    }
+};
+
+template <typename T> // Partial specialization of lapack_getrs for DEVICE_GPU, generic over element type T.
+struct lapack_getrs<T, DEVICE_GPU> {
+    void operator()( // Forwards every argument unchanged to cuSolverConnector::getrs.
+        const char& trans, // presumably LAPACK's transpose selector ('N'/'T'/'C') - TODO confirm
+        const int& n, // presumably the order of matrix A - TODO confirm
+        const int& nrhs, // presumably the number of right-hand-side columns in B - TODO confirm
+        T* A, // passed through to the connector; presumably the LU factors produced by getrf - confirm
+        const int& lda, // presumably the leading dimension of A - TODO confirm
+        const int* ipiv, // read-only buffer passed through; presumably the pivot indices from getrf - confirm
+        T* B, // non-const buffer passed through; presumably overwritten with the solution - confirm
+        const int& ldb) // presumably the leading dimension of B - TODO confirm
+    {
+        cuSolverConnector::getrs(cusolver_handle, trans, n, nrhs, A, lda, ipiv, B, ldb); // cusolver_handle is defined outside this hunk; this wrapper inspects no status itself
+    }
+};
+
 template struct set_matrix<float,  DEVICE_GPU>;
 template struct set_matrix<double, DEVICE_GPU>;
 template struct set_matrix<std::complex<float>,  DEVICE_GPU>;
@@ -142,5 +185,20 @@ template struct lapack_dngvd<double, DEVICE_GPU>;
 template struct lapack_dngvd<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_dngvd<std::complex<double>, DEVICE_GPU>;
 
+template struct lapack_getrf<float,  DEVICE_GPU>; // Explicit instantiations of the DEVICE_GPU getrf functor: float, double, and their std::complex forms.
+template struct lapack_getrf<double, DEVICE_GPU>;
+template struct lapack_getrf<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_getrf<std::complex<double>, DEVICE_GPU>;
+
+template struct lapack_getri<float,  DEVICE_GPU>; // Same four element types for the DEVICE_GPU getri functor (whose operator() always throws).
+template struct lapack_getri<double, DEVICE_GPU>;
+template struct lapack_getri<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_getri<std::complex<double>, DEVICE_GPU>;
+
+template struct lapack_getrs<float,  DEVICE_GPU>; // Same four element types for the DEVICE_GPU getrs functor.
+template struct lapack_getrs<double, DEVICE_GPU>;
+template struct lapack_getrs<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_getrs<std::complex<double>, DEVICE_GPU>;
+
 } // namespace kernels
 } // namespace container
@@ -131,9 +131,9 @@ struct lapack_getrf<T, DEVICE_CPU> {
         const int& n,
         T* Mat,
         const int& lda,
-        int* ipiv,
-        int& info)
+        int* ipiv)
     {
+        int info = 0;
         lapackConnector::getrf(m, n, Mat, lda, ipiv, info);
         if (info != 0) {
             throw std::runtime_error("getrf failed with info = " + std::to_string(info));
@@ -149,16 +149,36 @@ struct lapack_getri<T, DEVICE_CPU> {
         const int& lda,
         const int* ipiv,
         T* work,
-        const int& lwork,
-        int& info)
+        const int& lwork)
     {
+        int info = 0;
         lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info);
         if (info != 0) {
             throw std::runtime_error("getri failed with info = " + std::to_string(info));
         }
     }
 };
 
+template <typename T> // Partial specialization of lapack_getrs for DEVICE_CPU, generic over element type T.
+struct lapack_getrs<T, DEVICE_CPU> {
+    void operator()( // Calls lapackConnector::getrs and throws std::runtime_error when info ends up nonzero.
+        const char& trans, // presumably LAPACK's transpose selector ('N'/'T'/'C') - TODO confirm
+        const int& n, // presumably the order of matrix A - TODO confirm
+        const int& nrhs, // presumably the number of right-hand-side columns in B - TODO confirm
+        T* A, // passed through to the connector; presumably the LU factors produced by getrf - confirm
+        const int& lda, // presumably the leading dimension of A - TODO confirm
+        const int* ipiv, // read-only buffer passed through; presumably the pivot indices from getrf - confirm
+        T* B, // non-const buffer passed through; presumably overwritten with the solution - confirm
+        const int& ldb) // presumably the leading dimension of B - TODO confirm
+    {
+        int info = 0; // local status variable, not exposed to callers; presumably set by the connector call below
+        lapackConnector::getrs(trans, n, nrhs, A, lda, ipiv, B, ldb, info);
+        if (info != 0) { // any nonzero value is reported as a failure
+            throw std::runtime_error("getrs failed with info = " + std::to_string(info)); // message embeds the numeric info value
+        }
+    }
+};
+
 template struct set_matrix<float,  DEVICE_CPU>;
 template struct set_matrix<double, DEVICE_CPU>;
 template struct set_matrix<std::complex<float>,  DEVICE_CPU>;
@@ -194,5 +214,10 @@ template struct lapack_getri<double, DEVICE_CPU>;
 template struct lapack_getri<std::complex<float>, DEVICE_CPU>;
 template struct lapack_getri<std::complex<double>, DEVICE_CPU>;
 
+template struct lapack_getrs<float, DEVICE_CPU>; // Explicit instantiations of the DEVICE_CPU getrs functor: float, double, and their std::complex forms.
+template struct lapack_getrs<double, DEVICE_CPU>;
+template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
+template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;
+
 } // namespace kernels
 } // namespace container
@@ -73,8 +73,7 @@ struct lapack_getrf {
         const int& n,
         T* Mat,
         const int& lda,
-        int* ipiv,
-        int& info);
+        int* ipiv);
 };
 
 
@@ -86,10 +85,21 @@ struct lapack_getri {
         const int& lda,
         const int* ipiv,
         T* work,
-        const int& lwork,
-        int& info);
+        const int& lwork);
 };
 
+template <typename T, typename Device> // Primary template: T is the element type, Device selects the backend (DEVICE_CPU / DEVICE_GPU specializations exist).
+struct lapack_getrs {
+    void operator()( // Declared only; the per-device specializations provide the body. No status out-parameter is part of the signature.
+        const char& trans, // presumably LAPACK's transpose selector ('N'/'T'/'C') - TODO confirm
+        const int& n, // presumably the order of matrix A - TODO confirm
+        const int& nrhs, // presumably the number of right-hand-side columns in B - TODO confirm
+        T* A, // presumably the LU factors produced by getrf - confirm
+        const int& lda, // presumably the leading dimension of A - TODO confirm
+        const int* ipiv, // read-only; presumably the pivot indices from getrf - confirm
+        T* B, // non-const; presumably right-hand sides on entry and solution on exit - confirm
+        const int& ldb); // presumably the leading dimension of B - TODO confirm
+};
 
 #if defined(__CUDA) || defined(__ROCM)
 // TODO: Use C++ singleton to manage the GPU handles