@@ -48,6 +48,13 @@ void LinearAlgebraTorch::spmv(const SparseMatrix& A, const Vector& x, Vector& y)
4848 ASSERT (Ni == y.rows ());
4949 ASSERT (Nj == x.rows ());
5050
51+ // Note: every call copies its operands to the device selected by get_torch_device (name ())
52+ // and immediately copies the result back to host memory. When that device is a GPU, this
53+ // per-call transfer overhead can be significant and may negate the benefit of GPU computation
54+ // for small matrices or frequently repeated operations. GPU acceleration pays off mainly for
55+ // large matrices, where computation time dominates transfer time. For best performance, keep
56+ // data resident on the device across multiple operations instead of transferring on each call.
57+
5158 // multiplication
5259 auto A_tensor = make_torch_sparse_csr (A, get_torch_device (name ()));
5360 auto x_tensor = make_torch_dense_tensor (x, get_torch_device (name ()));
@@ -66,6 +73,13 @@ void LinearAlgebraTorch::spmm(const SparseMatrix& A, const Matrix& X, Matrix& Y)
6673 ASSERT (Nj == X.rows ());
6774 ASSERT (Nk == Y.cols ());
6875
76+ // Note: every call copies its operands to the device selected by get_torch_device (name ())
77+ // and immediately copies the result back to host memory. When that device is a GPU, this
78+ // per-call transfer overhead can be significant and may negate the benefit of GPU computation
79+ // for small matrices or frequently repeated operations. GPU acceleration pays off mainly for
80+ // large matrices, where computation time dominates transfer time. For best performance, keep
81+ // data resident on the device across multiple operations instead of transferring on each call.
82+
6983 // multiplication and conversion from column-major to row-major (and back)
7084 auto A_tensor = make_torch_sparse_csr (A, get_torch_device (name ()));
7185 auto X_tensor = make_torch_dense_tensor (X, get_torch_device (name ()));
0 commit comments