Skip to content

Commit a741180

Browse files
committed
[Code] Simplify cuda spgemv[_t] impls
1 parent 92e0575 commit a741180

File tree

2 files changed

+49
-145
lines changed

2 files changed

+49
-145
lines changed

cubool/sources/cuda/kernels/spgemv.cuh

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,28 @@
2727

2828
#include <cuda/details/sp_vector.hpp>
2929
#include <cuda/details/meta.hpp>
30+
#include <cuda/kernels/bin_search.cuh>
3031
#include <nsparse/matrix.h>
3132
#include <nsparse/detail/meta.h>
3233
#include <limits>
3334

3435
namespace cubool {
3536
namespace kernels {
3637

38+
template<typename IndexType>
39+
__device__ IndexType __get_vec_value(thrust::device_ptr<const IndexType> v, IndexType vSize, IndexType value) {
40+
auto begin = v;
41+
auto end = v + vSize;
42+
auto result = kernels::find(begin, end, value);
43+
44+
return result == end? 0x0u: 0x1;
45+
}
46+
3747
template<typename IndexType, size_t threads, size_t blockSize>
3848
__global__ void __spgemv(thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
3949
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
40-
thrust::device_ptr<const IndexType> v, // Input dense v vector
50+
thrust::device_ptr<const IndexType> v, // Input sparse v vector
51+
IndexType vSize, // Input sparse v vector size
4152
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
4253
thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
4354
IndexType rowsCount) { // Num of rows to process
@@ -48,22 +59,22 @@ namespace cubool {
4859
IndexType interBlockId = threadIdx.x / threads; // id of the group (number of `threads` belong to the same group)
4960
IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId; // row, which is processed by a group of `threads` threads
5061

62+
__shared__ IndexType tmp_accum[blockSize];
63+
5164
if (assignedOrder >= rowsCount)
5265
assignedOrder = rowsCount - 1;
5366

5467
IndexType i = rowConfig[assignedOrder]; // Row to process
5568

56-
size_t rowSize = rowOffsets[i + 1] - rowOffsets[i];
5769
size_t rowBegin = rowOffsets[i];
58-
59-
__shared__ IndexType tmp_accum[blockSize];
70+
size_t rowSize = rowOffsets[i + 1] - rowBegin;
6071

6172
// Initial zero
6273
tmp_accum[threadIdx.x] = 0;
6374

6475
// Each thread accum nnz values
6576
for (size_t k = id; k < rowSize; k += threads) {
66-
tmp_accum[threadIdx.x] |= v[colIndices[rowBegin + k]];
77+
tmp_accum[threadIdx.x] |= __get_vec_value<IndexType>(v, vSize, colIndices[rowBegin + k]);
6778
}
6879

6980
// Reduce accum to single value
@@ -104,7 +115,8 @@ namespace cubool {
104115
void dispatch(StreamsWrapper<Config<Bins...>> &streamsWrapper,
105116
thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
106117
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
107-
thrust::device_ptr<const IndexType> v, // Input dense v vector
118+
thrust::device_ptr<const IndexType> v, // Input sparse v vector
119+
IndexType vSize, // Input sparse v vector size
108120
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
109121
const std::vector<IndexType> &binSizes, // Size of bin in rowConfig
110122
const std::vector<IndexType> &binOffset, // Offset of bin in rowConfig
@@ -114,7 +126,7 @@ namespace cubool {
114126
(binSizes[Bins::id] > 0 ?
115127
__spgemv<IndexType, Bins::threads, Bins::blockSize>
116128
<<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0), Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
117-
(rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
129+
(rowOffsets, colIndices, v, vSize, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
118130
: void())
119131
);
120132
}
@@ -143,22 +155,10 @@ namespace cubool {
143155
if (v.m_vals == 0 || m.m_vals == 0)
144156
return VectorType(M);
145157

146-
// Resize cached buffers to store v and r as dense vectors
147-
if (mInput.size() < N)
148-
mInput.resize(N);
149-
158+
// Resize cached buffers to store r as dense vector
150159
if (mOutput.size() < M)
151160
mOutput.resize(M);
152161

153-
// Copy v to dense vector
154-
thrust::fill_n(mInput.begin(), N, (IndexType) 0);
155-
thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
156-
[input = mInput.data()]
157-
__device__(IndexType
158-
i) {
159-
input[i] = 1;
160-
});
161-
162162
// Empty out buffer
163163
thrust::fill_n(mOutput.begin(), M, (IndexType) 0);
164164

@@ -228,7 +228,8 @@ namespace cubool {
228228
dispatch(streamsWrapper,
229229
m.m_row_index.data(),
230230
m.m_col_index.data(),
231-
mInput.data(),
231+
v.m_rows_index.data(),
232+
v.m_vals,
232233
mOutput.data(),
233234
binSizes,
234235
binOffsets,
@@ -268,7 +269,6 @@ namespace cubool {
268269
ContainerType<index> mRowsConfig;
269270
ContainerType<index> mBinsSize;
270271
ContainerType<index> mBinsOffsets;
271-
ContainerType<index> mInput;
272272
ContainerType<index> mOutput;
273273
};
274274

cubool/sources/cuda/kernels/spgemv_t.cuh

Lines changed: 27 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -27,80 +27,30 @@
2727

2828
#include <cuda/details/sp_vector.hpp>
2929
#include <cuda/details/meta.hpp>
30+
#include <cuda/kernels/bin_search.cuh>
3031
#include <nsparse/matrix.h>
3132
#include <nsparse/detail/meta.h>
3233
#include <limits>
3334

3435
namespace cubool {
3536
namespace kernels {
3637

37-
template<typename IndexType, size_t threads, size_t blockSize>
38-
__global__ void __spgemv_t(thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
39-
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
40-
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
41-
thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
42-
IndexType rowsCount) { // Num of rows to process
43-
// Split block into number of groups of size `threads`.
44-
// Each group process its own row.
45-
46-
IndexType id = threadIdx.x % threads; // id of the thread within the row processing group
47-
IndexType interBlockId = threadIdx.x / threads; // id of the group (number of `threads` belong to the same group)
48-
IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId; // row, which is processed by a group of `threads` threads
49-
50-
if (assignedOrder >= rowsCount)
51-
assignedOrder = rowsCount - 1;
52-
53-
IndexType i = rowConfig[assignedOrder]; // Row to process
54-
55-
size_t rowSize = rowOffsets[i + 1] - rowOffsets[i];
56-
size_t rowBegin = rowOffsets[i];
57-
58-
// Add value to result
59-
for (size_t k = id; k < rowSize; k += threads) {
60-
x[colIndices[rowBegin + k]] = 0x1u;
61-
}
62-
}
63-
6438
template<typename IndexType, typename AllocType>
6539
struct SpGEMVT {
6640
template<typename T>
6741
using ContainerType = thrust::device_vector<T, typename AllocType::template rebind<T>::other>;
6842
using MatrixType = nsparse::matrix<bool, IndexType, AllocType>;
6943
using VectorType = details::SpVector<IndexType, AllocType>;
7044

71-
template<typename ... Bins>
72-
void dispatch(StreamsWrapper<Config<Bins...>> &streamsWrapper,
73-
thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
74-
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
75-
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
76-
const std::vector<IndexType> &binSizes, // Size of bin in rowConfig
77-
const std::vector<IndexType> &binOffset, // Offset of bin in rowConfig
78-
thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin
79-
80-
EXPAND_SIDE_EFFECTS(
81-
(binSizes[Bins::id] > 0 ?
82-
__spgemv_t<IndexType, Bins::threads, Bins::blockSize>
83-
<<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0), Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
84-
(rowOffsets, colIndices, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
85-
: void())
86-
);
87-
}
88-
8945
/**
9046
* Compute r = M^t x v
9147
*
92-
* Matrix-vector multiplication algorithm:
93-
* 1. Assign for each row its computation group (group defines number of threads used for processing)
94-
* 2. Run each group (larger row - more threads must be assigned)
95-
*
9648
* @param v Sparse vector
9749
* @param m Sparse matrix
9850
*
9951
* @return Sparse vector
10052
*/
10153
VectorType operator()(const VectorType &v, const MatrixType &m) {
102-
static constexpr size_t max = std::numeric_limits<size_t>::max();
103-
10454
auto N = m.m_cols;
10555
auto vnvals = v.m_vals;
10656

@@ -117,81 +67,38 @@ namespace cubool {
11767
// Empty out buffer
11868
thrust::fill_n(mOutput.begin(), N, (IndexType) 0);
11969

120-
using ConfigType = Config<Bin<4, 32, 1, 8, 0>,
121-
Bin<8, 32, 8, 16, 1>,
122-
Bin<16, 32, 16, 32, 2>,
123-
Bin<32, 32, 32, 64, 3>,
124-
Bin<64, 64, 64, 128, 4>,
125-
Bin<128, 128, 128, 256, 5>,
126-
Bin<256, 256, 256, max, 6>>;
127-
ConfigType config;
128-
129-
// Process only rows where the vector value is non-zero
130-
mRowsConfig.resize(vnvals);
131-
132-
mBinsSize.resize(config.binsCount());
133-
mBinsOffsets.resize(config.binsCount());
134-
135-
thrust::fill(mBinsSize.begin(), mBinsSize.end(), (IndexType) 0);
136-
137-
// Eval bins size for each row (look-up row by vector value)
138-
thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
139-
[config, binSize = mBinsSize.data(), rowsIndices = m.m_row_index.data()]
140-
__device__(IndexType i) {
141-
auto valsInRow = rowsIndices[i + 1] - rowsIndices[i];
142-
auto binId = config.selectBin(valsInRow);
143-
144-
if (binId == config.unusedBinId())
145-
// Ignore empty rows
146-
return;
147-
148-
atomicAdd((binSize + binId).get(), 1);
149-
});
150-
151-
// Offsets for each bin (each bin will have its own section in permutation buffer)
152-
thrust::exclusive_scan(mBinsSize.begin(), mBinsSize.end(), mBinsOffsets.begin(), 0, thrust::plus<IndexType>());
153-
154-
// Reset bin sizes (use as offsets for permutation id assignments)
155-
thrust::fill(mBinsSize.begin(), mBinsSize.end(), (IndexType) 0);
156-
157-
// Assign rows to their bins
158-
thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
159-
[config, binSize = mBinsSize.data(), binOffset = mBinsOffsets.data(), rowsConfig = mRowsConfig.data(),
160-
rowsIndices = m.m_row_index.data()]
161-
__device__(IndexType i) {
162-
auto valsInRow = rowsIndices[i + 1] - rowsIndices[i];
163-
auto binId = config.selectBin(valsInRow);
164-
165-
if (binId == config.unusedBinId())
166-
// Ignore empty rows
167-
return;
168-
169-
auto order = atomicAdd((binSize + binId).get(), 1);
170-
rowsConfig[binOffset[binId] + order] = i;
171-
});
70+
// Count number of rows of matrix m to process (based on v)
71+
ContainerType<IndexType> configTmp(vnvals + 1);
72+
ContainerType<IndexType> config(vnvals + 1);
17273

173-
// Bring to the host bin sizes and offsets
174-
std::vector<IndexType> binOffsets(mBinsOffsets.size());
175-
thrust::copy(mBinsOffsets.begin(), mBinsOffsets.end(), binOffsets.begin());
74+
thrust::for_each(thrust::counting_iterator<IndexType>(0), thrust::counting_iterator<IndexType>(vnvals),
75+
[vIndices = v.m_rows_index.data(), mRowOffset = m.m_row_index.data(), config = configTmp.data()]
76+
__device__(IndexType id) {
77+
auto rowId = vIndices[id];
78+
auto valuesCount = mRowOffset[rowId + 1] - mRowOffset[rowId];
79+
config[id] = valuesCount;
80+
});
17681

177-
std::vector<IndexType> binSizes(mBinsSize.size());
178-
thrust::copy(mBinsSize.begin(), mBinsSize.end(), binSizes.begin());
82+
// Offset of each row min-group
83+
thrust::exclusive_scan(configTmp.begin(), configTmp.end(), config.begin(), 0, thrust::plus<IndexType>());
17984

180-
// Stream for each bin
181-
StreamsWrapper<ConfigType> streamsWrapper;
85+
IndexType totalToProcess = config.back();
18286

183-
dispatch(streamsWrapper,
184-
m.m_row_index.data(),
185-
m.m_col_index.data(),
186-
mOutput.data(),
187-
binSizes,
188-
binOffsets,
189-
mRowsConfig.data());
87+
// For each value in selected rows run and compute non-zero output result values
88+
thrust::for_each(thrust::counting_iterator<IndexType>(0), thrust::counting_iterator<IndexType>(totalToProcess),
89+
[result = mOutput.data(), config = config.data(), rows = v.m_vals,
90+
rowOffsets = m.m_row_index.data(), colIndex = m.m_col_index.data(), vIndices = v.m_rows_index.data()]
91+
__device__(IndexType id) {
92+
auto configRow = kernels::findNearestRowIdx<IndexType>(id, rows, config); // Find config slot to process
93+
auto valueInRow = id - config[configRow]; // Find value relative id in row to process
94+
auto rowId = vIndices[configRow]; // Find row to process
95+
auto valueIdx = colIndex[rowOffsets[rowId] + valueInRow]; // Get actual col index
19096

191-
cudaDeviceSynchronize();
97+
result[valueIdx] = 0x1u; // This value in result vector is non-zero
98+
});
19299

193100
// Nnz of the result
194-
auto resultSize = thrust::reduce(mOutput.begin(), mOutput.end(), (IndexType) 0);
101+
auto resultSize = thrust::reduce(mOutput.begin(), mOutput.begin() + N, (IndexType) 0);
195102

196103
ContainerType<index> result(resultSize);
197104

@@ -219,10 +126,7 @@ namespace cubool {
219126
}
220127

221128
private:
222-
ContainerType<index> mRowsConfig;
223-
ContainerType<index> mBinsSize;
224-
ContainerType<index> mBinsOffsets;
225-
ContainerType<index> mOutput;
129+
ContainerType<IndexType> mOutput;
226130
};
227131

228132
}

0 commit comments

Comments
 (0)