2727
2828#include < cuda/details/sp_vector.hpp>
2929#include < cuda/details/meta.hpp>
30+ #include < cuda/kernels/bin_search.cuh>
3031#include < nsparse/matrix.h>
3132#include < nsparse/detail/meta.h>
3233#include < limits>
3334
3435namespace cubool {
3536 namespace kernels {
3637
37- template <typename IndexType, size_t threads, size_t blockSize>
38- __global__ void __spgemv_t (thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
39- thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
40- thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
41- thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
42- IndexType rowsCount) { // Num of rows to process
43- // Split block into number of groups of size `threads`.
44- // Each group process its own row.
45-
46- IndexType id = threadIdx .x % threads; // id of the thread withing row processing group
47- IndexType interBlockId = threadIdx .x / threads; // id of the group (number of `threads` belong to the same group)
48- IndexType assignedOrder = blockIdx .x * (blockSize / threads) + interBlockId; // row, which is process by number of `threads`
49-
50- if (assignedOrder >= rowsCount)
51- assignedOrder = rowsCount - 1 ;
52-
53- IndexType i = rowConfig[assignedOrder]; // Row to process
54-
55- size_t rowSize = rowOffsets[i + 1 ] - rowOffsets[i];
56- size_t rowBegin = rowOffsets[i];
57-
58- // Add value to result
59- for (size_t k = id; k < rowSize; k += threads) {
60- x[colIndices[rowBegin + k]] = 0x1u ;
61- }
62- }
63-
6438 template <typename IndexType, typename AllocType>
6539 struct SpGEMVT {
6640 template <typename T>
6741 using ContainerType = thrust::device_vector<T, typename AllocType::template rebind<T>::other>;
6842 using MatrixType = nsparse::matrix<bool , IndexType, AllocType>;
6943 using VectorType = details::SpVector<IndexType, AllocType>;
7044
71- template <typename ... Bins>
72- void dispatch (StreamsWrapper<Config<Bins...>> &streamsWrapper,
73- thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
74- thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
75- thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
76- const std::vector<IndexType> &binSizes, // Size of bin in rowConfig
77- const std::vector<IndexType> &binOffset, // Offset of bin in rowConfig
78- thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin)
79-
80- EXPAND_SIDE_EFFECTS (
81- (binSizes[Bins::id] > 0 ?
82- __spgemv_t <IndexType, Bins::threads, Bins::blockSize>
83- <<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0 ), Bins::blockSize, 0 , streamsWrapper.streams[Bins::id]>>>
84- (rowOffsets, colIndices, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
85- : void ())
86- );
87- }
88-
8945 /* *
9046 * Compute r = M^t x v
9147 *
92- * Matrix-vector multiplication algorithm:
93- * 1. Assign for each row its computation group (group defines number of threads used for processing)
94- * 2. Run each group (larger row - more threads must be assigned)
95- *
9648 * @param v Sparse vector
9749 * @param m Sparse matrix
9850 *
9951 * @return Sparse vector
10052 */
10153 VectorType operator ()(const VectorType &v, const MatrixType &m) {
102- static constexpr size_t max = std::numeric_limits<size_t >::max ();
103-
10454 auto N = m.m_cols ;
10555 auto vnvals = v.m_vals ;
10656
@@ -117,81 +67,38 @@ namespace cubool {
11767 // Empty out buffer
11868 thrust::fill_n (mOutput .begin (), N, (IndexType) 0 );
11969
120- using ConfigType = Config<Bin<4 , 32 , 1 , 8 , 0 >,
121- Bin<8 , 32 , 8 , 16 , 1 >,
122- Bin<16 , 32 , 16 , 32 , 2 >,
123- Bin<32 , 32 , 32 , 64 , 3 >,
124- Bin<64 , 64 , 64 , 128 , 4 >,
125- Bin<128 , 128 , 128 , 256 , 5 >,
126- Bin<256 , 256 , 256 , max, 6 >>;
127- ConfigType config;
128-
129- // Process only rows, where vec value is non-zero
130- mRowsConfig .resize (vnvals);
131-
132- mBinsSize .resize (config.binsCount ());
133- mBinsOffsets .resize (config.binsCount ());
134-
135- thrust::fill (mBinsSize .begin (), mBinsSize .end (), (IndexType) 0 );
136-
137- // Eval bins size for each row (look-up row by vector value)
138- thrust::for_each (v.m_rows_index .begin (), v.m_rows_index .end (),
139- [config, binSize = mBinsSize .data (), rowsIndices = m.m_row_index .data ()]
140- __device__ (IndexType i) {
141- auto valsInRow = rowsIndices[i + 1 ] - rowsIndices[i];
142- auto binId = config.selectBin (valsInRow);
143-
144- if (binId == config.unusedBinId ())
145- // Ignore empty rows
146- return ;
147-
148- atomicAdd ((binSize + binId).get (), 1 );
149- });
150-
151- // Offsets for each bin (each bin will have its own section in permutation buffer)
152- thrust::exclusive_scan (mBinsSize .begin (), mBinsSize .end (), mBinsOffsets .begin (), 0 , thrust::plus<IndexType>());
153-
154- // Reset bin sizes (use as offsets for permutation id assignments)
155- thrust::fill (mBinsSize .begin (), mBinsSize .end (), (IndexType) 0 );
156-
157- // Assign rows () for its bins
158- thrust::for_each (v.m_rows_index .begin (), v.m_rows_index .end (),
159- [config, binSize = mBinsSize .data (), binOffset = mBinsOffsets .data (), rowsConfig = mRowsConfig .data (),
160- rowsIndices = m.m_row_index .data ()]
161- __device__ (IndexType i) {
162- auto valsInRow = rowsIndices[i + 1 ] - rowsIndices[i];
163- auto binId = config.selectBin (valsInRow);
164-
165- if (binId == config.unusedBinId ())
166- // Ignore empty rows
167- return ;
168-
169- auto order = atomicAdd ((binSize + binId).get (), 1 );
170- rowsConfig[binOffset[binId] + order] = i;
171- });
70+ // Count number of rows of matrix m to process (based on v)
71+ ContainerType<IndexType> configTmp (vnvals + 1 );
72+ ContainerType<IndexType> config (vnvals + 1 );
17273
173- // Bring to the host bin sizes and offsets
174- std::vector<IndexType> binOffsets (mBinsOffsets .size ());
175- thrust::copy (mBinsOffsets .begin (), mBinsOffsets .end (), binOffsets.begin ());
74+ thrust::for_each (thrust::counting_iterator<IndexType>(0 ), thrust::counting_iterator<IndexType>(vnvals),
75+ [vIndices = v.m_rows_index .data (), mRowOffset = m.m_row_index .data (), config = configTmp.data ()]
76+ __device__ (IndexType id) {
77+ auto rowId = vIndices[id];
78+ auto valuesCount = mRowOffset [rowId + 1 ] - mRowOffset [rowId];
79+ config[id] = valuesCount;
80+ });
17681
177- std::vector<IndexType> binSizes ( mBinsSize . size ());
178- thrust::copy ( mBinsSize .begin (), mBinsSize .end (), binSizes .begin ());
82+ // Offset of each row min-group
83+ thrust::exclusive_scan (configTmp .begin (), configTmp .end (), config .begin (), 0 , thrust::plus<IndexType> ());
17984
180- // Stream for each bin
181- StreamsWrapper<ConfigType> streamsWrapper;
85+ IndexType totalToProcess = config.back ();
18286
183- dispatch (streamsWrapper,
184- m.m_row_index .data (),
185- m.m_col_index .data (),
186- mOutput .data (),
187- binSizes,
188- binOffsets,
189- mRowsConfig .data ());
87+ // For each value in selected rows run and compute non-zero output result values
88+ thrust::for_each (thrust::counting_iterator<IndexType>(0 ), thrust::counting_iterator<IndexType>(totalToProcess),
89+ [result = mOutput .data (), config = config.data (), rows = v.m_vals ,
90+ rowOffsets = m.m_row_index .data (), colIndex = m.m_col_index .data (), vIndices = v.m_rows_index .data ()]
91+ __device__ (IndexType id) {
92+ auto configRow = kernels::findNearestRowIdx<IndexType>(id, rows, config); // Find config slot to process
93+ auto valueInRow = id - config[configRow]; // Find value relative id in row to process
94+ auto rowId = vIndices[configRow]; // Find row to process
95+ auto valueIdx = colIndex[rowOffsets[rowId] + valueInRow]; // Get actual col index
19096
191- cudaDeviceSynchronize ();
97+ result[valueIdx] = 0x1u ; // This value in result vector is non-zero
98+ });
19299
193100 // Nnz of the result
194- auto resultSize = thrust::reduce (mOutput .begin (), mOutput .end () , (IndexType) 0 );
101+ auto resultSize = thrust::reduce (mOutput .begin (), mOutput .begin () + N , (IndexType) 0 );
195102
196103 ContainerType<index> result (resultSize);
197104
@@ -219,10 +126,7 @@ namespace cubool {
219126 }
220127
221128 private:
222- ContainerType<index> mRowsConfig ;
223- ContainerType<index> mBinsSize ;
224- ContainerType<index> mBinsOffsets ;
225- ContainerType<index> mOutput ;
129+ ContainerType<IndexType> mOutput ;
226130 };
227131
228132 }
0 commit comments