Skip to content

Commit a741180

Browse files
committed
[Code] Simplify cuda spgemv[_t] impls
1 parent 92e0575 commit a741180

File tree

2 files changed

+49
-145
lines changed

2 files changed

+49
-145
lines changed

cubool/sources/cuda/kernels/spgemv.cuh

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,28 @@
2727

2828
#include <cuda/details/sp_vector.hpp>
2929
#include <cuda/details/meta.hpp>
30+
#include <cuda/kernels/bin_search.cuh>
3031
#include <nsparse/matrix.h>
3132
#include <nsparse/detail/meta.h>
3233
#include <limits>
3334

3435
namespace cubool {
3536
namespace kernels {
3637

38+
template<typename IndexType>
39+
__device__ IndexType __get_vec_value(thrust::device_ptr<const IndexType> v, IndexType vSize, IndexType value) {
40+
auto begin = v;
41+
auto end = v + vSize;
42+
auto result = kernels::find(begin, end, value);
43+
44+
return result == end? 0x0u: 0x1;
45+
}
46+
3747
template<typename IndexType, size_t threads, size_t blockSize>
3848
__global__ void __spgemv(thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
3949
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
40-
thrust::device_ptr<const IndexType> v, // Input dense v vector
50+
thrust::device_ptr<const IndexType> v, // Input sparse v vector
51+
IndexType vSize, // Input sparse v vector size
4152
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
4253
thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
4354
IndexType rowsCount) { // Num of rows to process
@@ -48,22 +59,22 @@ namespace cubool {
4859
IndexType interBlockId = threadIdx.x / threads; // id of the group (number of `threads` belong to the same group)
4960
IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId; // row, which is processed by a group of `threads` threads
5061

62+
__shared__ IndexType tmp_accum[blockSize];
63+
5164
if (assignedOrder >= rowsCount)
5265
assignedOrder = rowsCount - 1;
5366

5467
IndexType i = rowConfig[assignedOrder]; // Row to process
5568

56-
size_t rowSize = rowOffsets[i + 1] - rowOffsets[i];
5769
size_t rowBegin = rowOffsets[i];
58-
59-
__shared__ IndexType tmp_accum[blockSize];
70+
size_t rowSize = rowOffsets[i + 1] - rowBegin;
6071

6172
// Initial zero
6273
tmp_accum[threadIdx.x] = 0;
6374

6475
// Each thread accum nnz values
6576
for (size_t k = id; k < rowSize; k += threads) {
66-
tmp_accum[threadIdx.x] |= v[colIndices[rowBegin + k]];
77+
tmp_accum[threadIdx.x] |= __get_vec_value<IndexType>(v, vSize, colIndices[rowBegin + k]);
6778
}
6879

6980
// Reduce accum to single value
@@ -104,7 +115,8 @@ namespace cubool {
104115
void dispatch(StreamsWrapper<Config<Bins...>> &streamsWrapper,
105116
thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
106117
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
107-
thrust::device_ptr<const IndexType> v, // Input dense v vector
118+
thrust::device_ptr<const IndexType> v, // Input sparse v vector
119+
IndexType vSize, // Input sparse v vector size
108120
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
109121
const std::vector<IndexType> &binSizes, // Size of bin in rowConfig
110122
const std::vector<IndexType> &binOffset, // Offset of bin in rowConfig
@@ -114,7 +126,7 @@ namespace cubool {
114126
(binSizes[Bins::id] > 0 ?
115127
__spgemv<IndexType, Bins::threads, Bins::blockSize>
116128
<<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0), Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
117-
(rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
129+
(rowOffsets, colIndices, v, vSize, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
118130
: void())
119131
);
120132
}
@@ -143,22 +155,10 @@ namespace cubool {
143155
if (v.m_vals == 0 || m.m_vals == 0)
144156
return VectorType(M);
145157

146-
// Resize cached buffers to store v and r as dense vectors
147-
if (mInput.size() < N)
148-
mInput.resize(N);
149-
158+
// Resize cached buffers to store r as dense vector
150159
if (mOutput.size() < M)
151160
mOutput.resize(M);
152161

153-
// Copy v to dense vector
154-
thrust::fill_n(mInput.begin(), N, (IndexType) 0);
155-
thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
156-
[input = mInput.data()]
157-
__device__(IndexType
158-
i) {
159-
input[i] = 1;
160-
});
161-
162162
// Empty out buffer
163163
thrust::fill_n(mOutput.begin(), M, (IndexType) 0);
164164

@@ -228,7 +228,8 @@ namespace cubool {
228228
dispatch(streamsWrapper,
229229
m.m_row_index.data(),
230230
m.m_col_index.data(),
231-
mInput.data(),
231+
v.m_rows_index.data(),
232+
v.m_vals,
232233
mOutput.data(),
233234
binSizes,
234235
binOffsets,
@@ -268,7 +269,6 @@ namespace cubool {
268269
ContainerType<index> mRowsConfig;
269270
ContainerType<index> mBinsSize;
270271
ContainerType<index> mBinsOffsets;
271-
ContainerType<index> mInput;
272272
ContainerType<index> mOutput;
273273
};
274274

cubool/sources/cuda/kernels/spgemv_t.cuh

Lines changed: 27 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -27,80 +27,30 @@
2727

2828
#include <cuda/details/sp_vector.hpp>
2929
#include <cuda/details/meta.hpp>
30+
#include <cuda/kernels/bin_search.cuh>
3031
#include <nsparse/matrix.h>
3132
#include <nsparse/detail/meta.h>
3233
#include <limits>
3334

3435
namespace cubool {
3536
namespace kernels {
3637

37-
template<typename IndexType, size_t threads, size_t blockSize>
38-
__global__ void __spgemv_t(thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
39-
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
40-
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
41-
thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
42-
IndexType rowsCount) { // Num of rows to process
43-
// Split block into number of groups of size `threads`.
44-
// Each group process its own row.
45-
46-
IndexType id = threadIdx.x % threads; // id of the thread within the row processing group
47-
IndexType interBlockId = threadIdx.x / threads; // id of the group (number of `threads` belong to the same group)
48-
IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId; // row, which is processed by a group of `threads` threads
49-
50-
if (assignedOrder >= rowsCount)
51-
assignedOrder = rowsCount - 1;
52-
53-
IndexType i = rowConfig[assignedOrder]; // Row to process
54-
55-
size_t rowSize = rowOffsets[i + 1] - rowOffsets[i];
56-
size_t rowBegin = rowOffsets[i];
57-
58-
// Add value to result
59-
for (size_t k = id; k < rowSize; k += threads) {
60-
x[colIndices[rowBegin + k]] = 0x1u;
61-
}
62-
}
63-
6438
template<typename IndexType, typename AllocType>
6539
struct SpGEMVT {
6640
template<typename T>
6741
using ContainerType = thrust::device_vector<T, typename AllocType::template rebind<T>::other>;
6842
using MatrixType = nsparse::matrix<bool, IndexType, AllocType>;
6943
using VectorType = details::SpVector<IndexType, AllocType>;
7044

71-
template<typename ... Bins>
72-
void dispatch(StreamsWrapper<Config<Bins...>> &streamsWrapper,
73-
thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
74-
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
75-
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
76-
const std::vector<IndexType> &binSizes, // Size of bin in rowConfig
77-
const std::vector<IndexType> &binOffset, // Offset of bin in rowConfig
78-
thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin
79-
80-
EXPAND_SIDE_EFFECTS(
81-
(binSizes[Bins::id] > 0 ?
82-
__spgemv_t<IndexType, Bins::threads, Bins::blockSize>
83-
<<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0), Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
84-
(rowOffsets, colIndices, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
85-
: void())
86-
);
87-
}
88-
8945
/**
9046
* Compute r = M^t x v
9147
*
92-
* Matrix-vector multiplication algorithm:
93-
* 1. Assign for each row its computation group (group defines number of threads used for processing)
94-
* 2. Run each group (larger row - more threads must be assigned)
95-
*
9648
* @param v Sparse vector
9749
* @param m Sparse matrix
9850
*
9951
* @return Sparse vector
10052
*/
10153
VectorType operator()(const VectorType &v, const MatrixType &m) {
102-
static constexpr size_t max = std::numeric_limits<size_t>::max();
103-
10454
auto N = m.m_cols;
10555
auto vnvals = v.m_vals;
10656

@@ -117,81 +67,38 @@ namespace cubool {
11767
// Empty out buffer
11868
thrust::fill_n(mOutput.begin(), N, (IndexType) 0);
11969

120-
using ConfigType = Config<Bin<4, 32, 1, 8, 0>,
121-
Bin<8, 32, 8, 16, 1>,
122-
Bin<16, 32, 16, 32, 2>,
123-
Bin<32, 32, 32, 64, 3>,
124-
Bin<64, 64, 64, 128, 4>,
125-
Bin<128, 128, 128, 256, 5>,
126-
Bin<256, 256, 256, max, 6>>;
127-
ConfigType config;
128-
129-
// Process only rows where the vector value is non-zero
130-
mRowsConfig.resize(vnvals);
131-
132-
mBinsSize.resize(config.binsCount());
133-
mBinsOffsets.resize(config.binsCount());
134-
135-
thrust::fill(mBinsSize.begin(), mBinsSize.end(), (IndexType) 0);
136-
137-
// Eval bins size for each row (look-up row by vector value)
138-
thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
139-
[config, binSize = mBinsSize.data(), rowsIndices = m.m_row_index.data()]
140-
__device__(IndexType i) {
141-
auto valsInRow = rowsIndices[i + 1] - rowsIndices[i];
142-
auto binId = config.selectBin(valsInRow);
143-
144-
if (binId == config.unusedBinId())
145-
// Ignore empty rows
146-
return;
147-
148-
atomicAdd((binSize + binId).get(), 1);
149-
});
150-
151-
// Offsets for each bin (each bin will have its own section in permutation buffer)
152-
thrust::exclusive_scan(mBinsSize.begin(), mBinsSize.end(), mBinsOffsets.begin(), 0, thrust::plus<IndexType>());
153-
154-
// Reset bin sizes (use as offsets for permutation id assignments)
155-
thrust::fill(mBinsSize.begin(), mBinsSize.end(), (IndexType) 0);
156-
157-
// Assign rows to their bins
158-
thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
159-
[config, binSize = mBinsSize.data(), binOffset = mBinsOffsets.data(), rowsConfig = mRowsConfig.data(),
160-
rowsIndices = m.m_row_index.data()]
161-
__device__(IndexType i) {
162-
auto valsInRow = rowsIndices[i + 1] - rowsIndices[i];
163-
auto binId = config.selectBin(valsInRow);
164-
165-
if (binId == config.unusedBinId())
166-
// Ignore empty rows
167-
return;
168-
169-
auto order = atomicAdd((binSize + binId).get(), 1);
170-
rowsConfig[binOffset[binId] + order] = i;
171-
});
70+
// Count number of rows of matrix m to process (based on v)
71+
ContainerType<IndexType> configTmp(vnvals + 1);
72+
ContainerType<IndexType> config(vnvals + 1);
17273

173-
// Bring to the host bin sizes and offsets
174-
std::vector<IndexType> binOffsets(mBinsOffsets.size());
175-
thrust::copy(mBinsOffsets.begin(), mBinsOffsets.end(), binOffsets.begin());
74+
thrust::for_each(thrust::counting_iterator<IndexType>(0), thrust::counting_iterator<IndexType>(vnvals),
75+
[vIndices = v.m_rows_index.data(), mRowOffset = m.m_row_index.data(), config = configTmp.data()]
76+
__device__(IndexType id) {
77+
auto rowId = vIndices[id];
78+
auto valuesCount = mRowOffset[rowId + 1] - mRowOffset[rowId];
79+
config[id] = valuesCount;
80+
});
17681

177-
std::vector<IndexType> binSizes(mBinsSize.size());
178-
thrust::copy(mBinsSize.begin(), mBinsSize.end(), binSizes.begin());
82+
// Offset of each row min-group
83+
thrust::exclusive_scan(configTmp.begin(), configTmp.end(), config.begin(), 0, thrust::plus<IndexType>());
17984

180-
// Stream for each bin
181-
StreamsWrapper<ConfigType> streamsWrapper;
85+
IndexType totalToProcess = config.back();
18286

183-
dispatch(streamsWrapper,
184-
m.m_row_index.data(),
185-
m.m_col_index.data(),
186-
mOutput.data(),
187-
binSizes,
188-
binOffsets,
189-
mRowsConfig.data());
87+
// For each value in selected rows run and compute non-zero output result values
88+
thrust::for_each(thrust::counting_iterator<IndexType>(0), thrust::counting_iterator<IndexType>(totalToProcess),
89+
[result = mOutput.data(), config = config.data(), rows = v.m_vals,
90+
rowOffsets = m.m_row_index.data(), colIndex = m.m_col_index.data(), vIndices = v.m_rows_index.data()]
91+
__device__(IndexType id) {
92+
auto configRow = kernels::findNearestRowIdx<IndexType>(id, rows, config); // Find config slot to process
93+
auto valueInRow = id - config[configRow]; // Find value relative id in row to process
94+
auto rowId = vIndices[configRow]; // Find row to process
95+
auto valueIdx = colIndex[rowOffsets[rowId] + valueInRow]; // Get actual col index
19096

191-
cudaDeviceSynchronize();
97+
result[valueIdx] = 0x1u; // This value in result vector is non-zero
98+
});
19299

193100
// Nnz of the result
194-
auto resultSize = thrust::reduce(mOutput.begin(), mOutput.end(), (IndexType) 0);
101+
auto resultSize = thrust::reduce(mOutput.begin(), mOutput.begin() + N, (IndexType) 0);
195102

196103
ContainerType<index> result(resultSize);
197104

@@ -219,10 +126,7 @@ namespace cubool {
219126
}
220127

221128
private:
222-
ContainerType<index> mRowsConfig;
223-
ContainerType<index> mBinsSize;
224-
ContainerType<index> mBinsOffsets;
225-
ContainerType<index> mOutput;
129+
ContainerType<IndexType> mOutput;
226130
};
227131

228132
}

0 commit comments

Comments
 (0)