deepmodeling
diff --git a/‎source/source_base/module_container/ATen/kernels/cuda/lapack.cu‎
Lines changed: 75 additions & 76 deletions b/‎source/source_base/module_container/ATen/kernels/cuda/lapack.cu‎
Lines changed: 75 additions & 76 deletions
diff --git a/‎source/source_base/module_container/ATen/kernels/lapack.cpp‎
Lines changed: 4 additions & 0 deletions b/‎source/source_base/module_container/ATen/kernels/lapack.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎source/source_base/module_container/ATen/kernels/lapack.h‎
Lines changed: 14 additions & 0 deletions b/‎source/source_base/module_container/ATen/kernels/lapack.h‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎source/source_base/module_container/ATen/kernels/test/lapack_test.cpp‎
Lines changed: 68 additions & 0 deletions b/‎source/source_base/module_container/ATen/kernels/test/lapack_test.cpp‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎source/source_base/module_container/base/third_party/blas.h‎
Lines changed: 1 addition & 1 deletion b/‎source/source_base/module_container/base/third_party/blas.h‎
Lines changed: 1 addition & 1 deletion
@@ -122,93 +122,89 @@ struct lapack_getri<T, DEVICE_GPU> {
 
 
 template <typename T>
-struct lapack_getrf_inplace<T, DEVICE_GPU> {
-    void operator(){
+struct lapack_geqrf_inplace<T, DEVICE_GPU> {
+    void operator()(
         const int m,
         const int n,
-        T *A,
+        T *d_A,
         const int lda)
     {
         const int k = std::min(m, n);
 
-        // 1. Allocate tau on device
+        // Allocate the k = min(m, n) element tau buffer on the device; it is passed to geqrf and then orgqr below
         T *d_tau;
         cudaErrcheck(cudaMalloc(&d_tau, sizeof(T) * k));
 
-        // 2. Query for workspace size
-        int lwork = 0;
-        int *d_info;
-        cudaErrcheck(cudaMalloc(&d_info, sizeof(int)));
-
-        // geqrf: workspace query
-        cuSolverConnector::geqrf(cusolverH, m, n, d_A, lda, d_tau, nullptr, -1, d_info);
-        // Note: cuSOLVER uses nullptr for query, result returned via lwork
-        // But we need to call it with real pointer to get lwork
-        T work_query;
-        cuSolverConnector::geqrf(cusolverH, m, n, d_A, lda, d_tau, &work_query, -1, d_info);
-
-        // In practice, we use helper function to get lwork
-        // Or use magma for better interface
-        // Let's assume we have a way to get lwork
-        // For now, do a dummy call to get it
-        size_t workspaceInBytes = 0;
-        cusolverErrcheck(cusolverDnXgeqrf_bufferSize(
-            cusolverH, m, n,
-            getCudaDataType<T>::type, d_A, lda,
-            getCudaDataType<T>::type, // for tau
-            CUDA_R_32F, // numerical precision
-            CUSOLVER_WORKSPACE_QUERY_USE_MAX, &workspaceInBytes));
-
-        lwork = static_cast<int>(workspaceInBytes / sizeof(T));
-
-        // Allocate workspace
-        T *d_work;
-        cudaErrcheck(cudaMalloc(&d_work, sizeof(T) * lwork));
-
-        // 3. Perform geqrf
-        cusolverErrcheck(cusolverDnXgeqrf(
-            cusolverH, m, n,
-            getCudaDataType<T>::type, d_A, lda,
-            d_tau,
-            getCudaDataType<T>::type,
-            d_work, lwork * sizeof(T),
-            d_info));
-
-        int info;
-        cudaErrcheck(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
-        if (info != 0) {
-            throw std::runtime_error("cuSOLVER geqrf failed with info = " + std::to_string(info));
-        }
+        cuSolverConnector::geqrf(cusolver_handle, m, n, d_A, lda, d_tau);
 
-        // 4. Generate Q using orgqr
-        // Query workspace for orgqr
-        cusolverErrcheck(cusolverDnXorgqr_bufferSize(
-            cusolverH, m, n, k,
-            getCudaDataType<T>::type, d_A, lda,
-            getCudaDataType<T>::type, d_tau,
-            CUDA_R_32F,
-            CUSOLVER_WORKSPACE_QUERY_USE_MAX, &workspaceInBytes));
-
-        lwork = static_cast<int>(workspaceInBytes / sizeof(T));
-        cudaErrcheck(cudaRealloc(&d_work, sizeof(T) * lwork)); // or realloc
-
-        // orgqr: generate Q
-        cusolverErrcheck(cusolverDnXorgqr(
-            cusolverH, m, n, k,
-            getCudaDataType<T>::type, d_A, lda,
-            getCudaDataType<T>::type, d_tau,
-            d_work, lwork * sizeof(T),
-            d_info));
-
-        cudaErrcheck(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
-        if (info != 0) {
-            throw std::runtime_error("cuSOLVER orgqr failed with info = " + std::to_string(info));
-        }
+        cuSolverConnector::orgqr(cusolver_handle, m, n, k, d_A, lda, d_tau);
 
-        // Clean up
         cudaErrcheck(cudaFree(d_tau));
-        cudaErrcheck(cudaFree(d_work));
-        cudaErrcheck(cudaFree(d_info));
+
+        // // geqrf: workspace query
+
+        // // In practice, we use helper function to get lwork
+        // // Or use magma for better interface
+        // // Let's assume we have a way to get lwork
+        // // For now, do a dummy call to get it
+        // size_t workspaceInBytes = 0;
+        // cusolverErrcheck(cusolverDnXgeqrf_bufferSize(
+        //     cusolverH, m, n,
+        //     getCudaDataType<T>::type, d_A, lda,
+        //     getCudaDataType<T>::type, // for tau
+        //     CUDA_R_32F, // numerical precision
+        //     CUSOLVER_WORKSPACE_QUERY_USE_MAX, &workspaceInBytes));
+
+        // lwork = static_cast<int>(workspaceInBytes / sizeof(T));
+
+        // // Allocate workspace
+        // T *d_work;
+        // cudaErrcheck(cudaMalloc(&d_work, sizeof(T) * lwork));
+
+        // // 3. Perform geqrf
+        // cusolverErrcheck(cusolverDnXgeqrf(
+        //     cusolverH, m, n,
+        //     getCudaDataType<T>::type, d_A, lda,
+        //     d_tau,
+        //     getCudaDataType<T>::type,
+        //     d_work, lwork * sizeof(T),
+        //     d_info));
+
+        // int info;
+        // cudaErrcheck(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+        // if (info != 0) {
+        //     throw std::runtime_error("cuSOLVER geqrf failed with info = " + std::to_string(info));
+        // }
+
+        // // 4. Generate Q using orgqr
+        // // Query workspace for orgqr
+        // cusolverErrcheck(cusolverDnXorgqr_bufferSize(
+        //     cusolverH, m, n, k,
+        //     getCudaDataType<T>::type, d_A, lda,
+        //     getCudaDataType<T>::type, d_tau,
+        //     CUDA_R_32F,
+        //     CUSOLVER_WORKSPACE_QUERY_USE_MAX, &workspaceInBytes));
+
+        // lwork = static_cast<int>(workspaceInBytes / sizeof(T));
+        // cudaErrcheck(cudaRealloc(&d_work, sizeof(T) * lwork)); // or realloc
+
+        // // orgqr: generate Q
+        // cusolverErrcheck(cusolverDnXorgqr(
+        //     cusolverH, m, n, k,
+        //     getCudaDataType<T>::type, d_A, lda,
+        //     getCudaDataType<T>::type, d_tau,
+        //     d_work, lwork * sizeof(T),
+        //     d_info));
+
+        // cudaErrcheck(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+        // if (info != 0) {
+        //     throw std::runtime_error("cuSOLVER orgqr failed with info = " + std::to_string(info));
+        // }
+
+        // // Clean up
+        // cudaErrcheck(cudaFree(d_tau));
+        // cudaErrcheck(cudaFree(d_work));
+        // cudaErrcheck(cudaFree(d_info));
     }
 };
 
@@ -391,7 +387,10 @@ template struct lapack_getri<double, DEVICE_GPU>;
 template struct lapack_getri<std::complex<float>,  DEVICE_GPU>;
 template struct lapack_getri<std::complex<double>, DEVICE_GPU>;
 
-
+template struct lapack_geqrf_inplace<float,  DEVICE_GPU>;
+template struct lapack_geqrf_inplace<double, DEVICE_GPU>;
+template struct lapack_geqrf_inplace<std::complex<float>,  DEVICE_GPU>;
+template struct lapack_geqrf_inplace<std::complex<double>, DEVICE_GPU>;
 
 } // namespace kernels
 } // namespace container
@@ -493,6 +493,10 @@ template struct lapack_getrs<double, DEVICE_CPU>;
 template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
 template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;
 
+template struct lapack_geqrf_inplace<float,  DEVICE_CPU>;
+template struct lapack_geqrf_inplace<double, DEVICE_CPU>;
+template struct lapack_geqrf_inplace<std::complex<float>,  DEVICE_CPU>;
+template struct lapack_geqrf_inplace<std::complex<double>, DEVICE_CPU>;
 
 template struct lapack_heevd<float,  DEVICE_CPU>;
 template struct lapack_heevd<double, DEVICE_CPU>;
 
@@ -67,6 +67,20 @@ struct lapack_getri {
 // that will change input Mat A to orthogonal/unitary matrix Q
 template <typename T, typename Device>
 struct lapack_geqrf_inplace {
+    /**
+     * @brief Perform in-place QR factorization of a matrix using LAPACK's geqrf function.
+     *
+     * This function computes the QR factorization of an m-by-n matrix A as A = Q * R,
+     * where Q is an orthogonal/unitary matrix and R is an upper triangular matrix.
+     * The factorization is performed in-place, meaning the input matrix A will be modified.
+     *
+     * On exit, A is overwritten with the orthogonal/unitary factor Q of the QR factorization.
+     *
+     * @param m The number of rows in the matrix A. m >= 0
+     * @param n The number of columns in the matrix A. n >= 0
+     * @param A Pointer to the matrix A to be factorized. On exit, contains the orthogonal/unitary factor Q
+     * @param lda The leading dimension of the matrix A. lda >= max(1, m)
+     */
     void operator()(
         const int m,
         const int n,
 
@@ -92,6 +92,74 @@ TYPED_TEST(LapackTest, Potrf) {
     EXPECT_EQ(A, C);
 }
 
+// Test for lapack_geqrf_inplace:
+// overwrite a 4 x 3 matrix with its Q factor and check that Q^H * Q = I
+TYPED_TEST(LapackTest, GeqrfInPlace) {
+    using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
+    using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
+
+    lapack_geqrf_inplace<Type, Device> geqrfCalculator;
+
+    const int m = 4;
+    const int n = 3;  // m >= n, so Q is an m x n matrix with orthonormal columns
+    const int lda = m;
+
+    Tensor A_input = std::move(Tensor({
+        static_cast<Type>(1.0), static_cast<Type>(2.0), static_cast<Type>(3.0), static_cast<Type>(4.0),
+        static_cast<Type>(5.0), static_cast<Type>(6.0), static_cast<Type>(7.0), static_cast<Type>(8.0),
+        static_cast<Type>(9.0), static_cast<Type>(10.0), static_cast<Type>(11.0), static_cast<Type>(12.0)
+    }).to_device<Device>());
+
+    Tensor A = A_input; // will be overwritten as Q
+
+    // do geqrf -> get orthogonal Q
+    geqrfCalculator(m, n, A.data<Type>(), lda);
+
+    // check on CPU
+    Tensor Q = A.to_device<DEVICE_CPU>();
+    const Type* Q_data = Q.data<Type>();
+
+    // compute QtQ = Q^H * Q (n x n); the 'C' flag passed to gemm below is the BLAS conjugate-transpose option
+    Tensor QtQ = Q; // std::move(Tensor(std::vector<Type>(n * n, static_cast<Type>(0.0))).to_device<DEVICE_CPU>());
+    const Type alpha = static_cast<Type>(1.0);
+    const Type beta  = static_cast<Type>(0.0);
+
+    blas_gemm<Type, DEVICE_CPU> gemm;
+    gemm('C', 'N',           // Q^T * Q
+         n, n, m,            //  n x n
+         &alpha,
+         Q_data, lda,        // Q^T
+         Q_data, lda,        // Q
+         &beta,
+         QtQ.data<Type>(), n);
+
+    // Test code: print A
+    std::cout << "A = " << std::endl;
+    for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < n; ++j) {
+            std::cout << A_input.to_device<DEVICE_CPU>().data<Type>()[i + j * m] << " ";
+        }
+        std::cout << std::endl;
+    }
+    // Test code: print QtQ
+    std::cout << "QtQ = " << std::endl;
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            std::cout << QtQ.data<Type>()[i + j * n] << " ";
+        }
+        std::cout << std::endl;
+    }
+
+    // check QtQ
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            Type expected = (i == j) ? static_cast<Type>(1.0) : static_cast<Type>(0.0);
+            EXPECT_NEAR(std::abs(QtQ.data<Type>()[i + j * n]), std::abs(expected), 1e-5)
+                << "Q^T * Q not identity at (" << i << "," << j << ")";
+        }
+    }
+}
+
 // Test for lapack_heevd and lapack_heevx:
 // Solve a standard eigenvalue problem
 // and check that A*V = V*E
 
@@ -26,7 +26,7 @@ void caxpy_(const int *N, const std::complex<float> *alpha, const std::complex<f
 void zaxpy_(const int *N, const std::complex<double> *alpha, const std::complex<double> *x, const int *incx, std::complex<double> *y, const int *incy);
 
 void scopy_(const int *n, const float *a, const int *incx, float *b, int const *incy);
-void dcopy_(const int *n, const double *a, const *incx, double *b, int const *incy);
+void dcopy_(const int *n, const double *a, const int *incx, double *b, int const *incy);
 void ccopy_(const int *n, const std::complex<float> *a, const int *incx, std::complex<float> *b, int const *incy);
 void zcopy_(const int *n, const std::complex<double> *a, const int *incx, std::complex<double> *b, int const *incy);