Skip to content

Commit bb6ea0e

Browse files
author
Daiki Adachi
committed
[fix] tensor_crs gpu blas
1 parent c22ae1a commit bb6ea0e

File tree

12 files changed

+176
-169
lines changed

12 files changed

+176
-169
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Unreleased
5353
- Add view1D of tensor_Dense <https://gitlab.ritc.jp/ricos/monolish/-/merge_requests/496> <https://github.com/ricosjp/monolish/issues/728>
5454
- Add times/adds/axpy tests for view1D of matrix/tensor <https://gitlab.ritc.jp/ricos/monolish/-/merge_requests/498> <https://github.com/ricosjp/monolish/issues/729>
5555
- Add variadic templates for reshape tensor <https://gitlab.ritc.jp/ricos/monolish/-/merge_requests/499> <https://github.com/ricosjp/monolish/issues/730>
56+
- Add tensor_CRS <https://gitlab.ritc.jp/ricos/monolish/-/merge_requests/514> <https://github.com/ricosjp/monolish/issues/753>
5657

5758
### Changed
5859
- Start developing 0.17.1 <https://gitlab.ritc.jp/ricos/monolish/-/merge_requests/487>

include/monolish/common/monolish_crs.hpp

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -188,23 +188,6 @@ template <typename Float> class CRS {
188188
CRS(const size_t M, const size_t N, const std::vector<int> &rowptr,
189189
const std::vector<int> &colind, const vector<Float> &value);
190190

191-
/**
192-
* @brief Create CRS matrix from shared_ptr
193-
* @param M # of row
194-
* @param N # of col
195-
* @param rowptr row_ptr, which stores the starting points of the rows of the
196-
*arrays value and col_ind (size M+1)
197-
* @param colind col_ind, which stores the column numbers of the non-zero
198-
*elements (size nnz)
199-
* @param value value index, which stores the non-zero elements (size nnz)
200-
* @note
201-
* - # of computation: (M+1)+nnz + (M+1)+nnz (compute hash)
202-
* - Multi-threading: false
203-
* - GPU acceleration: true
204-
**/
205-
CRS(const size_t M, const size_t N, const std::vector<int> &rowptr,
206-
const std::vector<int> &colind, const std::shared_ptr<Float> &value);
207-
208191
/**
209192
* @brief Convert CRS matrix from COO matrix, also compute the hash
210193
* @param coo COO format matrix

include/monolish/common/monolish_dense.hpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -228,18 +228,6 @@ template <typename Float> class Dense {
228228
**/
229229
Dense(const size_t M, const size_t N, const vector<Float> &value);
230230

231-
/**
232-
* @brief Create construct dense matrix
233-
* @param M # of row
234-
* @param N # of col
235-
* @param value value
236-
* @note
237-
* - # of computation: 1
238-
* - Multi-threading: true
239-
* - GPU acceleration: false
240-
**/
241-
Dense(const size_t M, const size_t N, const std::shared_ptr<Float> &value);
242-
243231
/**
244232
* @brief Create dense matrix from std::initializer_list
245233
* @param M # of row

include/monolish/common/monolish_tensor_dense.hpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -181,18 +181,6 @@ template <typename Float> class tensor_Dense {
181181
tensor_Dense(const std::vector<size_t> &shape,
182182
const std::vector<Float> &value);
183183

184-
/**
185-
* @brief Allocate tensor_Dense tensor
186-
* @param shape shape of tensor
187-
* @param value value std::vector
188-
* @note
189-
* - # of computation: 1
190-
* - Multi-threading: false
191-
* - GPU acceleration: false
192-
*/
193-
tensor_Dense(const std::vector<size_t> &shape,
194-
const std::shared_ptr<Float> &value);
195-
196184
/**
197185
* @brief Allocate tensor_Dense tensor
198186
* @param shape shape of tensor

src/blas/tensor/tensaddsub/tensor_crs_tensaddsub.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@ namespace monolish {
55

66
namespace {
77

8-
template<typename F>
9-
void tensadd_core(const tensor::tensor_CRS<F> &A, const tensor::tensor_CRS<F> &B, tensor::tensor_CRS<F> &C) {
8+
template <typename F>
9+
void tensadd_core(const tensor::tensor_CRS<F> &A,
10+
const tensor::tensor_CRS<F> &B, tensor::tensor_CRS<F> &C) {
1011
Logger &logger = Logger::get_instance();
1112
logger.func_in(monolish_func);
1213

@@ -21,8 +22,9 @@ void tensadd_core(const tensor::tensor_CRS<F> &A, const tensor::tensor_CRS<F> &B
2122
logger.func_out();
2223
}
2324

24-
template<typename F>
25-
void tenssub_core(const tensor::tensor_CRS<F> &A, const tensor::tensor_CRS<F> &B, tensor::tensor_CRS<F> &C) {
25+
template <typename F>
26+
void tenssub_core(const tensor::tensor_CRS<F> &A,
27+
const tensor::tensor_CRS<F> &B, tensor::tensor_CRS<F> &C) {
2628
Logger &logger = Logger::get_instance();
2729
logger.func_in(monolish_func);
2830

src/blas/tensor/tensmat/tensor_crs-dense_tensmat.hpp

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,26 @@ void tensor_CRS_Dense_Dtensmat_core(const double &a,
2828
size_t nsum = 0;
2929

3030
for (size_t d = 0; d < A.row_ptrs.size(); ++d) {
31-
matrix::CRS<double> Amat(row, col, A.row_ptrs[d], A.col_inds[d],
32-
A.get_val());
33-
Amat.set_first(A.get_offset() + nsum);
34-
nsum += A.col_inds[d].size();
35-
matrix::Dense<double> Cmat(row, B.get_col(), C.get_val());
36-
Cmat.set_first(C.get_offset() + d * row * B.get_col());
31+
std::vector<double> Aval(A.col_inds[d].size());
32+
matrix::CRS<double> Amat(row, col, A.row_ptrs[d], A.col_inds[d], Aval);
33+
std::vector<double> Cval(row * B.get_col());
34+
matrix::Dense<double> Cmat(row, B.get_col(), Cval);
35+
if (A.get_device_mem_stat()) {
36+
Amat.send();
37+
Cmat.send();
38+
}
39+
internal::vcopy(Aval.size(), A.begin() + nsum, Amat.begin(),
40+
A.get_device_mem_stat());
41+
internal::vcopy(Cval.size(), C.begin() + d * row * B.get_col(),
42+
Cmat.begin(), A.get_device_mem_stat());
3743
CRS_Dense_Dmatmul_core(a, Amat, B, b, Cmat);
44+
internal::vcopy(Cval.size(), Cmat.begin(),
45+
C.begin() + d * row * B.get_col(), A.get_device_mem_stat());
46+
if (A.get_device_mem_stat()) {
47+
Amat.recv();
48+
Cmat.recv();
49+
}
50+
nsum += A.col_inds[d].size();
3851
}
3952

4053
logger.func_out();
@@ -63,13 +76,26 @@ void tensor_CRS_Dense_Stensmat_core(const float &a,
6376
size_t nsum = 0;
6477

6578
for (size_t d = 0; d < A.row_ptrs.size(); ++d) {
66-
matrix::CRS<float> Amat(row, col, A.row_ptrs[d], A.col_inds[d],
67-
A.get_val());
68-
Amat.set_first(A.get_offset() + nsum);
69-
nsum += A.col_inds[d].size();
70-
matrix::Dense<float> Cmat(row, B.get_col(), C.get_val());
71-
Cmat.set_first(C.get_offset() + d * row * B.get_col());
79+
std::vector<float> Aval(A.col_inds[d].size());
80+
matrix::CRS<float> Amat(row, col, A.row_ptrs[d], A.col_inds[d], Aval);
81+
std::vector<float> Cval(row * B.get_col());
82+
matrix::Dense<float> Cmat(row, B.get_col(), Cval);
83+
if (A.get_device_mem_stat()) {
84+
Amat.send();
85+
Cmat.send();
86+
}
87+
internal::vcopy(Aval.size(), A.begin() + nsum, Amat.begin(),
88+
A.get_device_mem_stat());
89+
internal::vcopy(Cval.size(), C.begin() + d * row * B.get_col(),
90+
Cmat.begin(), A.get_device_mem_stat());
7291
CRS_Dense_Smatmul_core(a, Amat, B, b, Cmat);
92+
internal::vcopy(Cval.size(), Cmat.begin(),
93+
C.begin() + d * row * B.get_col(), A.get_device_mem_stat());
94+
if (A.get_device_mem_stat()) {
95+
Amat.recv();
96+
Cmat.recv();
97+
}
98+
nsum += A.col_inds[d].size();
7399
}
74100

75101
logger.func_out();

src/blas/tensor/tensmul/tensor_crs-tensor_dense_tensmul.hpp

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@ namespace monolish {
77
namespace {
88
// double ///////////////////
99
template <typename TENS2, typename TENS3>
10-
void tensor_CRS_tensor_Dense_Dtensmul_core(const double &a, const tensor::tensor_CRS<double> &A,
11-
const TENS2 &B, const double &b,
12-
TENS3 &C){
10+
void tensor_CRS_tensor_Dense_Dtensmul_core(const double &a,
11+
const tensor::tensor_CRS<double> &A,
12+
const TENS2 &B, const double &b,
13+
TENS3 &C) {
1314
Logger &logger = Logger::get_instance();
1415
logger.func_in(monolish_func);
1516

@@ -21,41 +22,55 @@ void tensor_CRS_tensor_Dense_Dtensmul_core(const double &a, const tensor::tensor
2122

2223
assert(col == Bshape[0]);
2324
std::vector<size_t> ABshape;
24-
for(size_t i=0; i+1<Ashape.size(); ++i){
25+
for (size_t i = 0; i + 1 < Ashape.size(); ++i) {
2526
ABshape.push_back(Ashape[i]);
2627
}
27-
for(size_t i=1; i<Bshape.size(); ++i){
28+
for (size_t i = 1; i < Bshape.size(); ++i) {
2829
ABshape.push_back(Bshape[i]);
2930
}
3031
assert(ABshape == Cshape);
3132

3233
std::vector<size_t> ABshape_tmp = Bshape;
3334
ABshape_tmp[0] = row;
3435
size_t ABshape_dim = 1;
35-
for(size_t i=0; i<ABshape_tmp.size(); ++i){
36+
for (size_t i = 0; i < ABshape_tmp.size(); ++i) {
3637
ABshape_dim *= ABshape_tmp[i];
3738
}
3839

3940
size_t nsum = 0;
4041

41-
for(size_t d=0; d<A.row_ptrs.size(); ++d){
42-
matrix::CRS<double> Amat(row, col, A.row_ptrs[d], A.col_inds[d], A.get_val());
43-
Amat.set_first(A.get_offset() + nsum);
44-
nsum += A.col_inds[d].size();
45-
tensor::tensor_Dense<double> Cmat(ABshape_tmp, C.get_val());
46-
Cmat.set_first(C.get_offset() + d * ABshape_dim);
42+
for (size_t d = 0; d < A.row_ptrs.size(); ++d) {
43+
std::vector<double> Aval(A.col_inds[d].size());
44+
matrix::CRS<double> Amat(row, col, A.row_ptrs[d], A.col_inds[d], Aval);
45+
std::vector<double> Cval(ABshape_dim);
46+
tensor::tensor_Dense<double> Cmat(ABshape_tmp, Cval);
47+
if (A.get_device_mem_stat()) {
48+
Amat.send();
49+
Cmat.send();
50+
}
51+
internal::vcopy(Aval.size(), A.begin() + nsum, Amat.begin(),
52+
A.get_device_mem_stat());
53+
internal::vcopy(Cval.size(), C.begin() + d * ABshape_dim, Cmat.begin(),
54+
A.get_device_mem_stat());
4755
CRS_tensor_Dense_Dmattens_core(a, Amat, B, b, Cmat);
56+
internal::vcopy(Cval.size(), Cmat.begin(), C.begin() + d * ABshape_dim,
57+
A.get_device_mem_stat());
58+
if (A.get_device_mem_stat()) {
59+
Amat.recv();
60+
Cmat.recv();
61+
}
62+
nsum += A.col_inds[d].size();
4863
}
4964

5065
logger.func_out();
51-
5266
}
5367

5468
// float ///////////////////
5569
template <typename TENS2, typename TENS3>
56-
void tensor_CRS_tensor_Dense_Stensmul_core(const float &a, const tensor::tensor_CRS<float> &A,
57-
const TENS2 &B, const float &b,
58-
TENS3 &C){
70+
void tensor_CRS_tensor_Dense_Stensmul_core(const float &a,
71+
const tensor::tensor_CRS<float> &A,
72+
const TENS2 &B, const float &b,
73+
TENS3 &C) {
5974
Logger &logger = Logger::get_instance();
6075
logger.func_in(monolish_func);
6176

@@ -67,30 +82,44 @@ void tensor_CRS_tensor_Dense_Stensmul_core(const float &a, const tensor::tensor_
6782

6883
assert(col == Bshape[0]);
6984
std::vector<size_t> ABshape;
70-
for(size_t i=0; i+1<Ashape.size(); ++i){
85+
for (size_t i = 0; i + 1 < Ashape.size(); ++i) {
7186
ABshape.push_back(Ashape[i]);
7287
}
73-
for(size_t i=1; i<Bshape.size(); ++i){
88+
for (size_t i = 1; i < Bshape.size(); ++i) {
7489
ABshape.push_back(Bshape[i]);
7590
}
7691
assert(ABshape == Cshape);
7792

7893
std::vector<size_t> ABshape_tmp = Bshape;
7994
ABshape_tmp[0] = row;
8095
size_t ABshape_dim = 1;
81-
for(size_t i=0; i<ABshape_tmp.size(); ++i){
96+
for (size_t i = 0; i < ABshape_tmp.size(); ++i) {
8297
ABshape_dim *= ABshape_tmp[i];
8398
}
8499

85100
size_t nsum = 0;
86101

87-
for(size_t d=0; d<A.row_ptrs.size(); ++d){
88-
matrix::CRS<float> Amat(row, col, A.row_ptrs[d], A.col_inds[d], A.get_val());
89-
Amat.set_first(A.get_offset() + nsum);
90-
nsum += A.col_inds[d].size();
91-
tensor::tensor_Dense<float> Cmat(ABshape_tmp, C.get_val());
92-
Cmat.set_first(C.get_offset() + d * ABshape_dim);
102+
for (size_t d = 0; d < A.row_ptrs.size(); ++d) {
103+
std::vector<float> Aval(A.col_inds[d].size());
104+
matrix::CRS<float> Amat(row, col, A.row_ptrs[d], A.col_inds[d], Aval);
105+
std::vector<float> Cval(ABshape_dim);
106+
tensor::tensor_Dense<float> Cmat(ABshape_tmp, Cval);
107+
if (A.get_device_mem_stat()) {
108+
Amat.send();
109+
Cmat.send();
110+
}
111+
internal::vcopy(Aval.size(), A.begin() + nsum, Amat.begin(),
112+
A.get_device_mem_stat());
113+
internal::vcopy(Cval.size(), C.begin() + d * ABshape_dim, Cmat.begin(),
114+
A.get_device_mem_stat());
93115
CRS_tensor_Dense_Smattens_core(a, Amat, B, b, Cmat);
116+
internal::vcopy(Cval.size(), Cmat.begin(), C.begin() + d * ABshape_dim,
117+
A.get_device_mem_stat());
118+
if (A.get_device_mem_stat()) {
119+
Amat.recv();
120+
Cmat.recv();
121+
}
122+
nsum += A.col_inds[d].size();
94123
}
95124

96125
logger.func_out();

src/blas/tensor/tensvec/tensor_crs_tensvec.hpp

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#pragma once
2+
#include "../../../internal/monolish_internal.hpp"
23
#include "../../matrix/matvec/crs_matvec.hpp"
34
#include "../../matrix/matvec/dense_matvec.hpp"
45

@@ -23,20 +24,26 @@ void Dtensvec_core(const double &a, const tensor::tensor_CRS<double> &A,
2324

2425
int nsum = 0;
2526
for (size_t d = 0; d < A.row_ptrs.size(); ++d) {
26-
std::vector<double> tmp(A.col_inds[d].size() + 1);
27-
for (size_t i = 0; i < A.col_inds[d].size(); ++i) {
28-
tmp[i] = A.begin()[i + nsum];
29-
}
30-
matrix::CRS<double> Amat(row, col, A.row_ptrs[d], A.col_inds[d], tmp);
31-
monolish::vector<double> Cvec(row);
32-
for (size_t i = 0; i < row; ++i) {
33-
Cvec.begin()[i] = C.begin()[d * row + i];
27+
std::vector<double> Aval(A.col_inds[d].size());
28+
matrix::CRS<double> Amat(row, col, A.row_ptrs[d], A.col_inds[d], Aval);
29+
std::vector<double> Cval(row);
30+
monolish::vector<double> Cvec(Cval);
31+
if (A.get_device_mem_stat()) {
32+
Amat.send();
33+
Cvec.send();
3434
}
35+
internal::vcopy(Aval.size(), A.begin() + nsum, Amat.begin(),
36+
A.get_device_mem_stat());
37+
internal::vcopy(Cval.size(), C.begin() + d * row, Cvec.begin(),
38+
A.get_device_mem_stat());
3539
Dmatvec_core(a, Amat, x, b, Cvec, transA);
36-
for (size_t i = 0; i < row; ++i) {
37-
C.begin()[d * row + i] = Cvec.begin()[i];
38-
}
3940
nsum += A.col_inds[d].size();
41+
internal::vcopy(Cval.size(), Cvec.begin(), C.begin() + d * row,
42+
A.get_device_mem_stat());
43+
if (A.get_device_mem_stat()) {
44+
Amat.recv();
45+
Cvec.recv();
46+
}
4047
}
4148

4249
logger.func_out();
@@ -61,20 +68,26 @@ void Stensvec_core(const float &a, const tensor::tensor_CRS<float> &A,
6168

6269
int nsum = 0;
6370
for (size_t d = 0; d < A.row_ptrs.size(); ++d) {
64-
std::vector<float> tmp(A.col_inds[d].size() + 1);
65-
for (size_t i = 0; i < A.col_inds[d].size(); ++i) {
66-
tmp[i] = A.begin()[i + nsum];
67-
}
68-
matrix::CRS<float> Amat(row, col, A.row_ptrs[d], A.col_inds[d], tmp);
69-
vector<float> Cvec(row);
70-
for (size_t i = 0; i < row; ++i) {
71-
Cvec.begin()[i] = C.begin()[d * row + i];
71+
std::vector<float> Aval(A.col_inds[d].size());
72+
matrix::CRS<float> Amat(row, col, A.row_ptrs[d], A.col_inds[d], Aval);
73+
std::vector<float> Cval(row);
74+
monolish::vector<float> Cvec(Cval);
75+
if (A.get_device_mem_stat()) {
76+
Amat.send();
77+
Cvec.send();
7278
}
79+
internal::vcopy(Aval.size(), A.begin() + nsum, Amat.begin(),
80+
A.get_device_mem_stat());
81+
internal::vcopy(Cval.size(), C.begin() + d * row, Cvec.begin(),
82+
A.get_device_mem_stat());
7383
Smatvec_core(a, Amat, x, b, Cvec, transA);
74-
for (size_t i = 0; i < row; ++i) {
75-
C.begin()[d * row + i] = Cvec.begin()[i];
76-
}
7784
nsum += A.col_inds[d].size();
85+
internal::vcopy(Cval.size(), Cvec.begin(), C.begin() + d * row,
86+
A.get_device_mem_stat());
87+
if (A.get_device_mem_stat()) {
88+
Amat.recv();
89+
Cvec.recv();
90+
}
7891
}
7992

8093
logger.func_out();

0 commit comments

Comments
 (0)