Skip to content

Commit a566ae1

Browse files
committed
eckit::linalg::dense::LinearAlgebraTorch, eckit::linalg::sparse::LinearAlgebraTorch: (1) single place for backend device name logic (detail::Torch), (2) const device/scalar type at construction, (3) limit MPS device (Apple) to dense functionality and single precision (current status)
1 parent ccb0c52 commit a566ae1

File tree

6 files changed

+94
-125
lines changed

6 files changed

+94
-125
lines changed

src/eckit/linalg/dense/LinearAlgebraTorch.cc

Lines changed: 17 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,40 +12,33 @@
1212
#include "eckit/linalg/dense/LinearAlgebraTorch.h"
1313

1414
#include <cstring>
15-
#include <ostream>
1615

1716
#include "eckit/exception/Exceptions.h"
1817
#include "eckit/linalg/Matrix.h"
1918
#include "eckit/linalg/Vector.h"
20-
2119
#include "eckit/linalg/detail/Torch.h"
2220

2321

2422
namespace eckit::linalg::dense {
2523

2624

27-
static const LinearAlgebraTorch LA_TORCH_CPU_1("torch");
28-
static const LinearAlgebraTorch LA_TORCH_CPU_2("torch-cpu");
29-
static const LinearAlgebraTorch LA_TORCH_CUDA("torch-cuda");
30-
static const LinearAlgebraTorch LA_TORCH_HIP("torch-hip");
31-
static const LinearAlgebraTorch LA_TORCH_MPS("torch-mps");
32-
static const LinearAlgebraTorch LA_TORCH_XPU("torch-xpu");
33-
static const LinearAlgebraTorch LA_TORCH_XLA("torch-xla");
34-
static const LinearAlgebraTorch LA_TORCH_META("torch-meta");
35-
36-
37-
using detail::get_torch_device;
38-
using detail::make_torch_dense_tensor;
39-
using detail::torch_tensor_transpose;
25+
static const LinearAlgebraTorch LA_TORCH_CPU_1("torch", torch::DeviceType::CPU);
26+
static const LinearAlgebraTorch LA_TORCH_CPU_2("torch-cpu", torch::DeviceType::CPU);
27+
static const LinearAlgebraTorch LA_TORCH_CUDA("torch-cuda", torch::DeviceType::CUDA);
28+
static const LinearAlgebraTorch LA_TORCH_HIP("torch-hip", torch::DeviceType::HIP);
29+
static const LinearAlgebraTorch LA_TORCH_MPS("torch-mps", torch::DeviceType::MPS, torch::kFloat32);
30+
static const LinearAlgebraTorch LA_TORCH_XPU("torch-xpu", torch::DeviceType::XPU);
31+
static const LinearAlgebraTorch LA_TORCH_XLA("torch-xla", torch::DeviceType::XLA);
32+
static const LinearAlgebraTorch LA_TORCH_META("torch-meta", torch::DeviceType::Meta);
4033

4134

4235
Scalar LinearAlgebraTorch::dot(const Vector& x, const Vector& y) const {
4336
ASSERT(x.size() == y.size());
4437

45-
auto x_tensor = make_torch_dense_tensor(x, get_torch_device(name()));
46-
auto y_tensor = make_torch_dense_tensor(y, get_torch_device(name()));
38+
auto x_tensor = make_dense_tensor(x);
39+
auto y_tensor = make_dense_tensor(y);
4740

48-
return torch::dot(x_tensor, y_tensor).to(torch::kCPU).item<Scalar>();
41+
return torch::dot(x_tensor, y_tensor).to(torch::DeviceType::CPU, torch::kFloat64).item<Scalar>();
4942
}
5043

5144

@@ -54,9 +47,9 @@ void LinearAlgebraTorch::gemv(const Matrix& A, const Vector& x, Vector& y) const
5447
ASSERT(A.rows() == y.rows());
5548

5649
// multiplication
57-
auto A_tensor = make_torch_dense_tensor(A, get_torch_device(name()));
58-
auto x_tensor = make_torch_dense_tensor(x, get_torch_device(name()));
59-
auto y_tensor = torch::matmul(A_tensor, x_tensor).to(torch::kCPU).contiguous();
50+
auto A_tensor = make_dense_tensor(A);
51+
auto x_tensor = make_dense_tensor(x);
52+
auto y_tensor = tensor_to_host(torch::matmul(A_tensor, x_tensor));
6053

6154
// assignment
6255
std::memcpy(y.data(), y_tensor.data_ptr<Scalar>(), y.rows() * sizeof(Scalar));
@@ -69,18 +62,13 @@ void LinearAlgebraTorch::gemm(const Matrix& A, const Matrix& X, Matrix& Y) const
6962
ASSERT(X.cols() == Y.cols());
7063

7164
// multiplication and conversion from column-major to row-major (and back)
72-
auto A_tensor = make_torch_dense_tensor(A, get_torch_device(name()));
73-
auto X_tensor = make_torch_dense_tensor(X, get_torch_device(name()));
74-
auto Y_tensor = torch_tensor_transpose(torch::matmul(A_tensor, X_tensor)).to(torch::kCPU).contiguous();
65+
auto A_tensor = make_dense_tensor(A);
66+
auto X_tensor = make_dense_tensor(X);
67+
auto Y_tensor = tensor_transpose(tensor_to_host(torch::matmul(A_tensor, X_tensor)));
7568

7669
// assignment
7770
std::memcpy(Y.data(), Y_tensor.data_ptr<Scalar>(), Y.size() * sizeof(Scalar));
7871
}
7972

8073

81-
void LinearAlgebraTorch::print(std::ostream& out) const {
82-
out << "LinearAlgebraTorch[]";
83-
}
84-
85-
8674
} // namespace eckit::linalg::dense

src/eckit/linalg/dense/LinearAlgebraTorch.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,21 @@
1212
#pragma once
1313

1414
#include "eckit/linalg/LinearAlgebraDense.h"
15+
#include "eckit/linalg/detail/Torch.h"
1516

1617

1718
namespace eckit::linalg::dense {
1819

19-
struct LinearAlgebraTorch final : public LinearAlgebraDense {
20-
LinearAlgebraTorch() = default;
21-
LinearAlgebraTorch(const std::string& name) : LinearAlgebraDense(name) {}
20+
21+
struct LinearAlgebraTorch final : public LinearAlgebraDense, detail::Torch {
22+
LinearAlgebraTorch(const std::string& name, torch::DeviceType device, torch::ScalarType scalar = torch::kFloat64) :
23+
LinearAlgebraDense(name), Torch(device, scalar) {}
2224

2325
Scalar dot(const Vector& x, const Vector& y) const override;
2426
void gemv(const Matrix& A, const Vector& x, Vector& y) const override;
2527
void gemm(const Matrix& A, const Matrix& X, Matrix& Y) const override;
26-
void print(std::ostream&) const override;
28+
void print(std::ostream& os) const override { Torch::print(os); }
2729
};
2830

31+
2932
} // namespace eckit::linalg::dense

src/eckit/linalg/detail/Torch.cc

Lines changed: 21 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@
1111

1212
#include "eckit/linalg/detail/Torch.h"
1313

14-
#include <map>
14+
#include <ostream>
1515
#include <type_traits>
1616

17-
#include "eckit/exception/Exceptions.h"
1817
#include "eckit/linalg/Matrix.h"
1918
#include "eckit/linalg/SparseMatrix.h"
2019
#include "eckit/linalg/Vector.h"
@@ -27,66 +26,48 @@ static_assert(std::is_same<int32_t, Index>::value, "Index type mismatch");
2726
static_assert(std::is_same<double, Scalar>::value, "Scalar type mismatch");
2827

2928

30-
torch::DeviceType get_torch_device(const std::string& name) {
31-
static const auto device = [&name]() {
32-
const std::map<std::string, torch::DeviceType> types{
33-
{"cpu", torch::DeviceType::CPU}, //
34-
{"cuda", torch::DeviceType::CUDA}, //
35-
{"hip", torch::DeviceType::HIP}, //
36-
{"mps", torch::DeviceType::MPS}, //
37-
{"xpu", torch::DeviceType::XPU}, //
38-
{"xla", torch::DeviceType::XLA}, //
39-
{"meta", torch::DeviceType::Meta}, //
40-
};
41-
42-
const auto sep = name.find_first_of('-');
43-
if (sep == std::string::npos) {
44-
return torch::DeviceType::CPU;
45-
}
46-
47-
if (auto it = types.find(name.substr(sep + 1)); it != types.end()) {
48-
return it->second;
49-
}
50-
51-
throw eckit::UserError("Unknown torch device: " + name);
52-
}();
53-
54-
return device;
29+
torch::Tensor Torch::tensor_transpose(const torch::Tensor& tensor) const {
30+
return tensor.transpose(0, 1).contiguous();
5531
}
5632

5733

58-
torch::Tensor torch_tensor_transpose(const torch::Tensor& tensor) {
59-
return tensor.transpose(0, 1).contiguous();
34+
torch::Tensor Torch::tensor_to_host(const torch::Tensor& tensor) const {
35+
return tensor.to(torch::DeviceType::CPU, torch::kFloat64).contiguous(); // reverse MPS float32 (if applicable)
6036
}
6137

6238

63-
torch::Tensor make_torch_dense_tensor(const Matrix& A, torch::DeviceType device) {
39+
torch::Tensor Torch::make_dense_tensor(const Matrix& A) const {
6440
auto Ni = static_cast<int64_t>(A.cols());
6541
auto Nj = static_cast<int64_t>(A.rows());
6642

67-
return torch_tensor_transpose(
68-
torch::from_blob(const_cast<Scalar*>(A.data()), {Ni, Nj}, torch::kFloat64).to(device));
43+
return tensor_transpose(
44+
torch::from_blob(const_cast<Scalar*>(A.data()), {Ni, Nj}, torch::kFloat64).to(device_, scalar_));
6945
}
7046

7147

72-
torch::Tensor make_torch_dense_tensor(const Vector& V, torch::DeviceType device) {
48+
torch::Tensor Torch::make_dense_tensor(const Vector& V) const {
7349
auto Ni = static_cast<int64_t>(V.size());
7450

75-
return torch::from_blob(const_cast<Scalar*>(V.data()), {Ni}, torch::kFloat64).to(device);
51+
return torch::from_blob(const_cast<Scalar*>(V.data()), {Ni}, torch::kFloat64).to(device_, scalar_);
7652
}
7753

7854

79-
torch::Tensor make_torch_sparse_csr(const SparseMatrix& A, torch::DeviceType device) {
55+
torch::Tensor Torch::make_sparse_csr_tensor(const SparseMatrix& A) const {
8056
auto Ni = static_cast<int64_t>(A.rows());
8157
auto Nj = static_cast<int64_t>(A.cols());
8258
auto Nz = static_cast<int64_t>(A.nonZeros());
8359

84-
auto ia = torch::from_blob(const_cast<Index*>(A.outer()), {Ni + 1}, torch::kInt32).to(device, torch::kInt64);
85-
auto ja = torch::from_blob(const_cast<Index*>(A.inner()), {Nz}, torch::kInt32).to(device, torch::kInt64);
86-
auto a = torch::from_blob(const_cast<Scalar*>(A.data()), {Nz}, torch::kFloat64).to(device);
60+
auto ia = torch::from_blob(const_cast<Index*>(A.outer()), {Ni + 1}, torch::kInt32).to(device_, torch::kInt64);
61+
auto ja = torch::from_blob(const_cast<Index*>(A.inner()), {Nz}, torch::kInt32).to(device_, torch::kInt64);
62+
auto a = torch::from_blob(const_cast<Scalar*>(A.data()), {Nz}, torch::kFloat64).to(device_, scalar_);
63+
64+
return torch::sparse_csr_tensor(ia, ja, a, {Ni, Nj},
65+
torch::TensorOptions().dtype(scalar_).device(device_).layout(torch::kSparseCsr));
66+
}
67+
8768

88-
return torch::sparse_csr_tensor(
89-
ia, ja, a, {Ni, Nj}, torch::TensorOptions().dtype(torch::kFloat64).device(device).layout(torch::kSparseCsr));
69+
void Torch::print(std::ostream& os) const {
70+
os << "LinearAlgebraTorch[device=" << device_ << "]";
9071
}
9172

9273

src/eckit/linalg/detail/Torch.h

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
#pragma once
1313

14-
#include <string>
14+
#include <iosfwd>
1515

1616
#include "eckit/linalg/types.h"
1717

@@ -21,11 +21,31 @@
2121
namespace eckit::linalg::detail {
2222

2323

24-
torch::DeviceType get_torch_device(const std::string&);
25-
torch::Tensor torch_tensor_transpose(const torch::Tensor&);
26-
torch::Tensor make_torch_dense_tensor(const Matrix&, torch::DeviceType);
27-
torch::Tensor make_torch_dense_tensor(const Vector&, torch::DeviceType);
28-
torch::Tensor make_torch_sparse_csr(const SparseMatrix&, torch::DeviceType);
24+
/**
25+
* @brief Torch tensor creation and device management for linear algebra backends.
26+
*
27+
* Copies data host to/from device per operation. Transfer overhead may outweigh accelerator device gains for
28+
* small/frequent operations; best suited for large matrices where compute dominates.
29+
*/
30+
class Torch {
31+
protected:
32+
33+
explicit Torch(torch::DeviceType device, torch::ScalarType scalar) : device_(device), scalar_(scalar) {}
34+
35+
torch::Tensor tensor_transpose(const torch::Tensor&) const;
36+
torch::Tensor tensor_to_host(const torch::Tensor&) const;
37+
38+
torch::Tensor make_dense_tensor(const Matrix&) const;
39+
torch::Tensor make_dense_tensor(const Vector&) const;
40+
torch::Tensor make_sparse_csr_tensor(const SparseMatrix&) const;
41+
42+
void print(std::ostream&) const;
43+
44+
private:
45+
46+
const torch::DeviceType device_;
47+
const torch::ScalarType scalar_;
48+
};
2949

3050

3151
} // namespace eckit::linalg::detail

src/eckit/linalg/sparse/LinearAlgebraTorch.cc

Lines changed: 15 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -12,34 +12,26 @@
1212
#include "eckit/linalg/sparse/LinearAlgebraTorch.h"
1313

1414
#include <cstring>
15-
#include <ostream>
1615

1716
#include "eckit/exception/Exceptions.h"
1817
#include "eckit/linalg/Matrix.h"
1918
#include "eckit/linalg/SparseMatrix.h"
2019
#include "eckit/linalg/Vector.h"
21-
#include "eckit/linalg/sparse/LinearAlgebraGeneric.h"
22-
2320
#include "eckit/linalg/detail/Torch.h"
21+
#include "eckit/linalg/sparse/LinearAlgebraGeneric.h"
2422

2523

2624
namespace eckit::linalg::sparse {
2725

2826

29-
static const LinearAlgebraTorch LA_TORCH_CPU_1("torch");
30-
static const LinearAlgebraTorch LA_TORCH_CPU_2("torch-cpu");
31-
static const LinearAlgebraTorch LA_TORCH_CUDA("torch-cuda");
32-
static const LinearAlgebraTorch LA_TORCH_HIP("torch-hip");
33-
static const LinearAlgebraTorch LA_TORCH_MPS("torch-mps");
34-
static const LinearAlgebraTorch LA_TORCH_XPU("torch-xpu");
35-
static const LinearAlgebraTorch LA_TORCH_XLA("torch-xla");
36-
static const LinearAlgebraTorch LA_TORCH_META("torch-meta");
37-
38-
39-
using detail::get_torch_device;
40-
using detail::make_torch_dense_tensor;
41-
using detail::make_torch_sparse_csr;
42-
using detail::torch_tensor_transpose;
27+
static const LinearAlgebraTorch LA_TORCH_CPU_1("torch", torch::DeviceType::CPU);
28+
static const LinearAlgebraTorch LA_TORCH_CPU_2("torch-cpu", torch::DeviceType::CPU);
29+
static const LinearAlgebraTorch LA_TORCH_CUDA("torch-cuda", torch::DeviceType::CUDA);
30+
static const LinearAlgebraTorch LA_TORCH_HIP("torch-hip", torch::DeviceType::HIP);
31+
// static const LinearAlgebraTorch LA_TORCH_MPS("torch-mps", torch::DeviceType::MPS);
32+
static const LinearAlgebraTorch LA_TORCH_XPU("torch-xpu", torch::DeviceType::XPU);
33+
static const LinearAlgebraTorch LA_TORCH_XLA("torch-xla", torch::DeviceType::XLA);
34+
static const LinearAlgebraTorch LA_TORCH_META("torch-meta", torch::DeviceType::Meta);
4335

4436

4537
void LinearAlgebraTorch::spmv(const SparseMatrix& A, const Vector& x, Vector& y) const {
@@ -48,17 +40,10 @@ void LinearAlgebraTorch::spmv(const SparseMatrix& A, const Vector& x, Vector& y)
4840
ASSERT(Ni == y.rows());
4941
ASSERT(Nj == x.rows());
5042

51-
// Note: This implementation copies data to GPU memory for each operation and immediately
52-
// copies the result back to CPU. This data transfer overhead can be significant and may
53-
// negate the performance benefits of GPU computation for small matrices or frequent operations.
54-
// GPU acceleration is most beneficial for large matrices where computation time dominates
55-
// transfer overhead. For optimal performance, consider keeping data on GPU across multiple
56-
// operations rather than transferring for each call.
57-
5843
// multiplication
59-
auto A_tensor = make_torch_sparse_csr(A, get_torch_device(name()));
60-
auto x_tensor = make_torch_dense_tensor(x, get_torch_device(name()));
61-
auto y_tensor = torch::matmul(A_tensor, x_tensor).to(torch::kCPU).contiguous();
44+
auto A_tensor = make_sparse_csr_tensor(A);
45+
auto x_tensor = make_dense_tensor(x);
46+
auto y_tensor = tensor_to_host(torch::matmul(A_tensor, x_tensor));
6247

6348
// assignment
6449
std::memcpy(y.data(), y_tensor.data_ptr<Scalar>(), Ni * sizeof(Scalar));
@@ -73,17 +58,10 @@ void LinearAlgebraTorch::spmm(const SparseMatrix& A, const Matrix& X, Matrix& Y)
7358
ASSERT(Nj == X.rows());
7459
ASSERT(Nk == Y.cols());
7560

76-
// Note: This implementation copies data to GPU memory for each operation and immediately
77-
// copies the result back to CPU. This data transfer overhead can be significant and may
78-
// negate the performance benefits of GPU computation for small matrices or frequent operations.
79-
// GPU acceleration is most beneficial for large matrices where computation time dominates
80-
// transfer overhead. For optimal performance, consider keeping data on GPU across multiple
81-
// operations rather than transferring for each call.
82-
8361
// multiplication and conversion from column-major to row-major (and back)
84-
auto A_tensor = make_torch_sparse_csr(A, get_torch_device(name()));
85-
auto X_tensor = make_torch_dense_tensor(X, get_torch_device(name()));
86-
auto Y_tensor = torch_tensor_transpose(torch::matmul(A_tensor, X_tensor)).to(torch::kCPU).contiguous();
62+
auto A_tensor = make_sparse_csr_tensor(A);
63+
auto X_tensor = make_dense_tensor(X);
64+
auto Y_tensor = tensor_transpose(tensor_to_host(torch::matmul(A_tensor, X_tensor)));
8765

8866
// assignment
8967
std::memcpy(Y.data(), Y_tensor.data_ptr<Scalar>(), Y.size() * sizeof(Scalar));
@@ -96,9 +74,4 @@ void LinearAlgebraTorch::dsptd(const Vector& x, const SparseMatrix& A, const Vec
9674
}
9775

9876

99-
void LinearAlgebraTorch::print(std::ostream& out) const {
100-
out << "LinearAlgebraTorch[]";
101-
}
102-
103-
10477
} // namespace eckit::linalg::sparse

src/eckit/linalg/sparse/LinearAlgebraTorch.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,21 @@
1212
#pragma once
1313

1414
#include "eckit/linalg/LinearAlgebraSparse.h"
15+
#include "eckit/linalg/detail/Torch.h"
16+
1517

1618
namespace eckit::linalg::sparse {
1719

18-
struct LinearAlgebraTorch final : public LinearAlgebraSparse {
19-
LinearAlgebraTorch() = default;
20-
LinearAlgebraTorch(const std::string& name) : LinearAlgebraSparse(name) {}
20+
21+
struct LinearAlgebraTorch final : public LinearAlgebraSparse, detail::Torch {
22+
LinearAlgebraTorch(const std::string& name, torch::DeviceType device, torch::ScalarType scalar = torch::kFloat64) :
23+
LinearAlgebraSparse(name), Torch(device, scalar) {}
2124

2225
void spmv(const SparseMatrix&, const Vector&, Vector&) const override;
2326
void spmm(const SparseMatrix&, const Matrix&, Matrix&) const override;
2427
void dsptd(const Vector&, const SparseMatrix&, const Vector&, SparseMatrix&) const override;
25-
void print(std::ostream&) const override;
28+
void print(std::ostream& os) const override { Torch::print(os); }
2629
};
2730

31+
2832
} // namespace eckit::linalg::sparse

0 commit comments

Comments (0)