Add geqrf

Cstandardlib · Cstandardlib · commit 0d3f2d060e56 · 2025-10-27T21:18:41.000+08:00
diff --git a/source/source_base/module_container/ATen/kernels/cuda/lapack.cu b/source/source_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -121,6 +121,97 @@ struct lapack_getri<T, DEVICE_GPU> {
 };
 
 
+namespace {
+
+// Owns one cudaMalloc'ed array and frees it when the scope ends, so the
+// exceptions thrown below on a non-zero info do not leak device memory.
+template <typename U>
+struct QrDeviceBuffer {
+    U *ptr = nullptr;
+
+    explicit QrDeviceBuffer(const int count)
+    {
+        // Request at least one element so ptr is a real allocation even when
+        // cuSOLVER reports a zero-sized workspace.
+        const size_t elems = count > 0 ? static_cast<size_t>(count) : 1;
+        cudaErrcheck(cudaMalloc(reinterpret_cast<void**>(&ptr), sizeof(U) * elems));
+    }
+    // The status is deliberately ignored: this may run during stack unwinding.
+    ~QrDeviceBuffer() { (void)cudaFree(ptr); }
+
+    QrDeviceBuffer(const QrDeviceBuffer&) = delete;
+    QrDeviceBuffer& operator=(const QrDeviceBuffer&) = delete;
+};
+
+// geqrf workspace query (in elements), one overload per scalar type.
+inline cusolverStatus_t geqrf_buffer_size(cusolverDnHandle_t h, int m, int n, float *A, int lda, int *lwork)
+{
+    return cusolverDnSgeqrf_bufferSize(h, m, n, A, lda, lwork);
+}
+inline cusolverStatus_t geqrf_buffer_size(cusolverDnHandle_t h, int m, int n, double *A, int lda, int *lwork)
+{
+    return cusolverDnDgeqrf_bufferSize(h, m, n, A, lda, lwork);
+}
+inline cusolverStatus_t geqrf_buffer_size(cusolverDnHandle_t h, int m, int n, std::complex<float> *A, int lda, int *lwork)
+{
+    return cusolverDnCgeqrf_bufferSize(h, m, n, reinterpret_cast<cuComplex*>(A), lda, lwork);
+}
+inline cusolverStatus_t geqrf_buffer_size(cusolverDnHandle_t h, int m, int n, std::complex<double> *A, int lda, int *lwork)
+{
+    return cusolverDnZgeqrf_bufferSize(h, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda, lwork);
+}
+
+// geqrf itself: A is overwritten with R (upper triangle) and the reflectors.
+inline cusolverStatus_t geqrf_run(cusolverDnHandle_t h, int m, int n, float *A, int lda,
+                                  float *tau, float *work, int lwork, int *d_info)
+{
+    return cusolverDnSgeqrf(h, m, n, A, lda, tau, work, lwork, d_info);
+}
+inline cusolverStatus_t geqrf_run(cusolverDnHandle_t h, int m, int n, double *A, int lda,
+                                  double *tau, double *work, int lwork, int *d_info)
+{
+    return cusolverDnDgeqrf(h, m, n, A, lda, tau, work, lwork, d_info);
+}
+inline cusolverStatus_t geqrf_run(cusolverDnHandle_t h, int m, int n, std::complex<float> *A, int lda,
+                                  std::complex<float> *tau, std::complex<float> *work, int lwork, int *d_info)
+{
+    return cusolverDnCgeqrf(h, m, n, reinterpret_cast<cuComplex*>(A), lda,
+                            reinterpret_cast<cuComplex*>(tau),
+                            reinterpret_cast<cuComplex*>(work), lwork, d_info);
+}
+inline cusolverStatus_t geqrf_run(cusolverDnHandle_t h, int m, int n, std::complex<double> *A, int lda,
+                                  std::complex<double> *tau, std::complex<double> *work, int lwork, int *d_info)
+{
+    return cusolverDnZgeqrf(h, m, n, reinterpret_cast<cuDoubleComplex*>(A), lda,
+                            reinterpret_cast<cuDoubleComplex*>(tau),
+                            reinterpret_cast<cuDoubleComplex*>(work), lwork, d_info);
+}
+
+// orgqr/ungqr workspace query (in elements).
+inline cusolverStatus_t orgqr_buffer_size(cusolverDnHandle_t h, int m, int n, int k, const float *A, int lda,
+                                          const float *tau, int *lwork)
+{
+    return cusolverDnSorgqr_bufferSize(h, m, n, k, A, lda, tau, lwork);
+}
+inline cusolverStatus_t orgqr_buffer_size(cusolverDnHandle_t h, int m, int n, int k, const double *A, int lda,
+                                          const double *tau, int *lwork)
+{
+    return cusolverDnDorgqr_bufferSize(h, m, n, k, A, lda, tau, lwork);
+}
+inline cusolverStatus_t orgqr_buffer_size(cusolverDnHandle_t h, int m, int n, int k, const std::complex<float> *A, int lda,
+                                          const std::complex<float> *tau, int *lwork)
+{
+    return cusolverDnCungqr_bufferSize(h, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
+                                       reinterpret_cast<const cuComplex*>(tau), lwork);
+}
+inline cusolverStatus_t orgqr_buffer_size(cusolverDnHandle_t h, int m, int n, int k, const std::complex<double> *A, int lda,
+                                          const std::complex<double> *tau, int *lwork)
+{
+    return cusolverDnZungqr_bufferSize(h, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                                       reinterpret_cast<const cuDoubleComplex*>(tau), lwork);
+}
+
+// orgqr/ungqr: overwrite A with the explicit Q built from the reflectors.
+inline cusolverStatus_t orgqr_run(cusolverDnHandle_t h, int m, int n, int k, float *A, int lda,
+                                  const float *tau, float *work, int lwork, int *d_info)
+{
+    return cusolverDnSorgqr(h, m, n, k, A, lda, tau, work, lwork, d_info);
+}
+inline cusolverStatus_t orgqr_run(cusolverDnHandle_t h, int m, int n, int k, double *A, int lda,
+                                  const double *tau, double *work, int lwork, int *d_info)
+{
+    return cusolverDnDorgqr(h, m, n, k, A, lda, tau, work, lwork, d_info);
+}
+inline cusolverStatus_t orgqr_run(cusolverDnHandle_t h, int m, int n, int k, std::complex<float> *A, int lda,
+                                  const std::complex<float> *tau, std::complex<float> *work, int lwork, int *d_info)
+{
+    return cusolverDnCungqr(h, m, n, k, reinterpret_cast<cuComplex*>(A), lda,
+                            reinterpret_cast<const cuComplex*>(tau),
+                            reinterpret_cast<cuComplex*>(work), lwork, d_info);
+}
+inline cusolverStatus_t orgqr_run(cusolverDnHandle_t h, int m, int n, int k, std::complex<double> *A, int lda,
+                                  const std::complex<double> *tau, std::complex<double> *work, int lwork, int *d_info)
+{
+    return cusolverDnZungqr(h, m, n, k, reinterpret_cast<cuDoubleComplex*>(A), lda,
+                            reinterpret_cast<const cuDoubleComplex*>(tau),
+                            reinterpret_cast<cuDoubleComplex*>(work), lwork, d_info);
+}
+
+// Copies the device-side info flag back (blocking copy) and throws if the
+// routine named by `routine` reported a non-zero value.
+inline void throw_on_bad_info(const int *d_info, const char *routine)
+{
+    int info = 0;
+    cudaErrcheck(cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+    if (info != 0) {
+        throw std::runtime_error(std::string("cuSOLVER ") + routine + " failed with info = " + std::to_string(info));
+    }
+}
+
+} // namespace
+
+/**
+ * GPU in-place QR orthogonalization.
+ *
+ * Factorizes the m-by-n device matrix A (leading dimension lda) with geqrf
+ * and then overwrites it with the explicit Q factor generated by orgqr/ungqr.
+ * R and tau are not returned.
+ *
+ * @param m   The number of rows in the matrix.
+ * @param n   The number of columns in the matrix; must not exceed m.
+ * @param A   Device pointer to the matrix; overwritten with Q.
+ * @param lda The leading dimension of the matrix.
+ *
+ * Throws std::runtime_error if m < n or if cuSOLVER reports info != 0.
+ */
+template <typename T>
+struct lapack_geqrf_inplace<T, DEVICE_GPU> {
+    void operator()(
+        const int m,
+        const int n,
+        T *A,
+        const int lda)
+    {
+        // cuSOLVER's orgqr documents m >= n >= k; reject wide matrices before
+        // geqrf has overwritten A.
+        if (m < n) {
+            throw std::runtime_error("geqrf_inplace requires m >= n to form Q in place, got m = "
+                                     + std::to_string(m) + ", n = " + std::to_string(n));
+        }
+        const int k = std::min(m, n);
+
+        // NOTE(review): `cusolverH` is the handle name used by the original
+        // patch; its declaration is not visible here - confirm it matches the
+        // cuSOLVER handle this file actually defines.
+        cusolverDnHandle_t handle = cusolverH;
+
+        QrDeviceBuffer<T> d_tau(k);
+        QrDeviceBuffer<int> d_info(1);
+        int lwork = 0;
+
+        // 1. geqrf: A <- R (upper triangle) + reflectors, tau <- scalar factors
+        cusolverErrcheck(geqrf_buffer_size(handle, m, n, A, lda, &lwork));
+        {
+            QrDeviceBuffer<T> d_work(lwork);
+            cusolverErrcheck(geqrf_run(handle, m, n, A, lda, d_tau.ptr, d_work.ptr, lwork, d_info.ptr));
+            throw_on_bad_info(d_info.ptr, "geqrf");
+        }
+
+        // 2. orgqr/ungqr: A <- explicit Q
+        cusolverErrcheck(orgqr_buffer_size(handle, m, n, k, A, lda, d_tau.ptr, &lwork));
+        {
+            QrDeviceBuffer<T> d_work(lwork);
+            cusolverErrcheck(orgqr_run(handle, m, n, k, A, lda, d_tau.ptr, d_work.ptr, lwork, d_info.ptr));
+            throw_on_bad_info(d_info.ptr, "orgqr");
+        }
+    }
+};
+
 // --- 2. Linear System Solvers ---
 template <typename T>
 struct lapack_getrs<T, DEVICE_GPU> {
diff --git a/source/source_base/module_container/ATen/kernels/lapack.cpp b/source/source_base/module_container/ATen/kernels/lapack.cpp
@@ -110,6 +110,64 @@ struct lapack_getri<T, DEVICE_CPU> {
     }
 };
 
+/**
+ * CPU in-place QR orthogonalization.
+ *
+ * Factorizes the m-by-n matrix A (leading dimension lda) with geqrf and then
+ * overwrites it with the explicit Q factor generated by orgqr, so on return
+ * the columns of A are orthonormal. R and tau are not returned.
+ *
+ * Throws std::runtime_error if m < n or if any LAPACK call reports info != 0.
+ */
+template <typename T>
+struct lapack_geqrf_inplace<T, DEVICE_CPU> {
+    void operator()(
+        const int m,
+        const int n,
+        T *A,
+        const int lda)
+    {
+        // LAPACK's orgqr/ungqr requires M >= N. Check before geqrf runs:
+        // otherwise A would already be overwritten by the factorization when
+        // orgqr rejects the arguments.
+        if (m < n) {
+            throw std::runtime_error("geqrf_inplace requires m >= n to form Q in place, got m = "
+                                     + std::to_string(m) + ", n = " + std::to_string(n));
+        }
+
+        // 1. tau stores the scalar factors of the min(m, n) elementary reflectors
+        const int k = std::min(m, n);
+        Tensor tau(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {k});
+        tau.zero();
+
+        int info = 0;
+
+        // 2. workspace query: lwork = -1 makes geqrf report the optimal size
+        //    in work_query instead of factorizing
+        int lwork = -1;
+        T work_query;
+        lapackConnector::geqrf(m, n, A, lda, tau.data<T>(), &work_query, lwork, info);
+        if (info != 0) {
+            throw std::runtime_error("geqrf workspace query failed with info = " + std::to_string(info));
+        }
+        // allocate workspace
+        lwork = static_cast<int>(get_real(work_query));
+        Tensor work(DataTypeToEnum<T>::value, DeviceType::CpuDevice, {lwork});
+        work.zero();
+
+        // 3. perform the QR decomposition:
+        //    the upper triangle of A becomes R, the part below the diagonal
+        //    together with tau encodes Q
+        lapackConnector::geqrf(m, n, A, lda, tau.data<T>(), work.data<T>(), lwork, info);
+        if (info != 0) {
+            throw std::runtime_error("geqrf failed with info = " + std::to_string(info));
+        }
+
+        // 4. use orgqr to form Q explicitly; query its workspace first
+        lwork = -1;
+        lapackConnector::orgqr(m, n, k, A, lda, tau.data<T>(), &work_query, lwork, info);
+        if (info != 0) {
+            throw std::runtime_error("orgqr workspace query failed with info = " + std::to_string(info));
+        }
+        // allocate workspace
+        lwork = static_cast<int>(get_real(work_query));
+        work.resize({lwork});
+
+        // compute Q; A is overwritten with it
+        lapackConnector::orgqr(m, n, k, A, lda, tau.data<T>(), work.data<T>(), lwork, info);
+        if (info != 0) {
+            throw std::runtime_error("orgqr failed with info = " + std::to_string(info));
+        }
+    }
+};
 
 // --- 2. Linear System Solvers ---
 template <typename T>
diff --git a/source/source_base/module_container/ATen/kernels/lapack.h b/source/source_base/module_container/ATen/kernels/lapack.h
@@ -63,6 +63,41 @@ struct lapack_getri {
         const int& lwork);
 };
 
+// This is QR factorization in-place
+// that will change input Mat A to orthogonal/unitary matrix Q
+//
+// The DEVICE_CPU specialization factorizes A with geqrf and then calls orgqr
+// on the result, so the caller gets Q back in A; R and the tau scalars are
+// local to the call and are not returned.
+//
+// @param m   The number of rows in the matrix.
+// @param n   The number of columns in the matrix.
+// @param A   The matrix to be factorized; overwritten with Q.
+// @param lda The leading dimension of the matrix.
+//
+// NOTE(review): orgqr is invoked with (m, n, min(m, n)); LAPACK documents
+// M >= N for that routine, so wide inputs (m < n) look unsupported - confirm.
+template <typename T, typename Device>
+struct lapack_geqrf_inplace {
+    void operator()(
+        const int m,
+        const int n,
+        T *A,
+        const int lda);
+};
+
+// This is QR factorization
+// where [in]Mat will be kept and the results are stored in separate matrix Q
+// template <typename T, typename Device>
+// struct lapack_geqrf{
+//     /**
+//      * Perform QR factorization of a matrix using LAPACK's geqrf function.
+//      *
+//      * @param m The number of rows in the matrix.
+//      * @param n The number of columns in the matrix.
+//      * @param Mat The matrix to be factorized.
+//      *        On exit, the upper triangle contains the upper triangular matrix R,
+//      *        and the elements below the diagonal, with the array TAU, represent
+//      *        the unitary matrix Q as a product of min(m,n) elementary reflectors.
+//      * @param lda The leading dimension of the matrix.
+//      * @param tau Array of size min(m,n) receiving the scalar factors of the elementary (Householder) reflectors.
+//      */
+//     void operator()(
+//         const int m,
+//         const int n,
+//         T *Mat,
+//         const int lda,
+//         T *tau);
+// };
+
 
 // --- 2. Linear System Solvers ---
 template <typename T, typename Device>
diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h
@@ -1136,6 +1136,34 @@ void getrs(cusolverDnHandle_t& cusolver_handle, const char& trans, const int& n,
     cudaErrcheck(cudaFree(d_info));
 }
 
+// QR decomposition
+// geqrf, orgqr
+// Note:
+// cuSOLVER offers two geqrf interfaces:
+//   - the typed legacy one, cusolverDn<t>geqrf
+//   - the generic 64-bit one, cusolverDnXgeqrf
+// This wrapper uses the typed one; which of the two is preferable is still open.
+//
+// Computes the QR factorization of the m-by-n device matrix A in place
+// (single-precision complex). The workspace and the info flag are allocated
+// and released inside the call.
+//
+// @param cusolver_handle cuSOLVER dense handle.
+// @param m   Number of rows of A.
+// @param n   Number of columns of A.
+// @param A   Device pointer to the matrix, overwritten by the factorization.
+// @param lda Leading dimension of A.
+// @param tau Device pointer passed to cuSOLVER as the TAU output array.
+//
+// Throws std::runtime_error if cuSOLVER reports a non-zero info.
+static inline
+void geqrf(cusolverDnHandle_t& cusolver_handle, const int m, const int n, std::complex<float>* A, const int lda, std::complex<float>* tau)
+{
+    // first allocate memory for workspace
+    int lwork = 0;
+    cusolverErrcheck(cusolverDnCgeqrf_bufferSize(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
+
+    cuComplex* d_work = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_work, static_cast<size_t>(lwork) * sizeof(cuComplex)));
+
+    // device-side status flag written by cuSOLVER
+    int* d_info = nullptr;
+    cudaErrcheck(cudaMalloc((void**)&d_info, sizeof(int)));
+
+    // compute QR decomposition
+    cusolverErrcheck(cusolverDnCgeqrf(cusolver_handle, m, n, reinterpret_cast<cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(tau), d_work, lwork, d_info));
+
+    // blocking copy: the factorization has finished once h_info is available
+    int h_info = 0;
+    cudaErrcheck(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // release the temporaries before reporting, so a failure does not leak them
+    cudaErrcheck(cudaFree(d_work));
+    cudaErrcheck(cudaFree(d_info));
+
+    if (h_info != 0) {
+        throw std::runtime_error("geqrf: failed to compute QR decomposition");
+    }
+}
+
 } // namespace cuSolverConnector
 } // namespace container
 
diff --git a/source/source_base/module_container/base/third_party/lapack.h b/source/source_base/module_container/base/third_party/lapack.h
@@ -181,6 +181,7 @@ void dgetri_(const int* n, double* A, const int* lda, const int* ipiv, double* w
 void cgetri_(const int* n, std::complex<float>* A, const int* lda, const int* ipiv, std::complex<float>* work, const int* lwork, int* info);
 void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);
 
+<<<<<<< Updated upstream
 // Solve linear system using LU factorization
 void sgetrs_(const char* trans, const int* n, const int* nrhs,
              const float* A, const int* lda, const int* ipiv,
@@ -194,6 +195,23 @@ void cgetrs_(const char* trans, const int* n, const int* nrhs,
 void zgetrs_(const char* trans, const int* n, const int* nrhs,
              const std::complex<double>* A, const int* lda, const int* ipiv,
              std::complex<double>* B, const int* ldb, int* info);
+=======
+void sgetrs_(const char* trans, const int* n, const int* nrhs, const float* A, const int* lda, const int* ipiv, float* B, const int* ldb, int* info);
+void dgetrs_(const char* trans, const int* n, const int* nrhs, const double* A, const int* lda, const int* ipiv, double* B, const int* ldb, int* info);
+void cgetrs_(const char* trans, const int* n, const int* nrhs, const std::complex<float>* A, const int* lda, const int* ipiv, std::complex<float>* B, const int* ldb, int* info);
+void zgetrs_(const char* trans, const int* n, const int* nrhs, const std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* B, const int* ldb, int* info);
+
+void sgeqrf_(const int* m, const int* n, float* A, const int* lda, float* tau, float *work, const int* lwork, int* info);
+void dgeqrf_(const int* m, const int* n, double* A, const int* lda, double* tau, double *work, const int* lwork, int* info);
+void cgeqrf_(const int* m, const int* n, std::complex<float>* A, const int* lda, std::complex<float>* tau, std::complex<float> *work, const int* lwork, int* info);
+void zgeqrf_(const int* m, const int* n, std::complex<double>* A, const int* lda, std::complex<double>* tau, std::complex<double> *work, const int* lwork, int* info);
+
+void sorgqr_(const int* m, const int* n, const int* k, float* A, const int* lda, const float* tau, float* work, const int* lwork, int* info);
+void dorgqr_(const int* m, const int* n, const int* k, double* A, const int* lda, const double* tau, double* work, const int* lwork, int* info);
+void cungqr_(const int* m, const int* n, const int* k, std::complex<float>* A, const int* lda, const std::complex<float>* tau, std::complex<float> *work, const int* lwork, int* info);
+void zunqrf_(const int* m, const int* n, const int* k, std::complex<double>* A, const int* lda, const std::complex<double>* tau, std::complex<double> *work, const int* lwork, int* info);
+
+>>>>>>> Stashed changes
 }
 
 // Class LapackConnector provide the connector to fortran lapack routine.
@@ -535,6 +553,49 @@ void getrs(const char& trans, const int n, const int nrhs, std::complex<double>*
     zgetrs_(&trans, &n, &nrhs, A, &lda, ipiv, B, &ldb, &info);
 }
 
+// LAPACK routines for QR decomposition
+static inline
+void geqrf(const int m, const int n, float* A, const int lda, float* tau, float* work, const int lwork, int& info)
+{
+    sgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
+}
+static inline
+void geqrf(const int m, const int n, double* A, const int lda, double* tau, double* work, const int lwork, int& info)
+{
+    dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
+}
+static inline
+void geqrf(const int m, const int n, std::complex<float>* A, const int lda, std::complex<float>* tau, std::complex<float>* work, const int lwork, int& info)
+{
+    cgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
+}
+static inline
+void geqrf(const int m, const int n, std::complex<double>* A, const int lda, std::complex<double>* tau, std::complex<double>* work, const int lwork, int& info)
+{
+    zgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
+}
+// these routines generate the orthogonal matrix Q from the QR decomposition
+static inline
+void orgqr(const int m, const int n, const int k, float* A, const int lda, const float* tau, float* work, const int lwork, int& info)
+{
+    sorgqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
+}
+static inline
+void orgqr(const int m, const int n, const int k, double* A, const int lda, const double* tau, double* work, const int lwork, int& info)
+{
+    dorgqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
+}
+static inline
+void orgqr(const int m, const int n, const int k, std::complex<float>* A, const int lda, const std::complex<float>* tau, std::complex<float>* work, const int lwork, int& info)
+{
+    cungqr_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
+}
+static inline
+void orgqr(const int m, const int n, const int k, std::complex<double>* A, const int lda, const std::complex<double>* tau, std::complex<double>* work, const int lwork, int& info)
+{
+    zunqrf_(&m, &n, &k, A, &lda, tau, work, &lwork, &info);
+}
+
 } // namespace lapackConnector
 } // namespace container