-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel0.cu
More file actions
91 lines (73 loc) · 3.71 KB
/
kernel0.cu
File metadata and controls
91 lines (73 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include <cstdio>

#include "common.h"
#include "timer.h"
// Kernel 0: Basic GPU parallel implementation
// Each thread processes one row of the output matrix
// Uses externally allocated output pools (following CPU pattern)
// No advanced optimizations - baseline implementation
// --- RTX 6000 / RTX 4090 / V100 / T4 (48KB shared memory per block) ---
#define BLOCK_SIZE 256
// For bigger GPU, we can try using larger block size
// --- A100 (164KB shared memory per block) ---
// #define BLOCK_SIZE 512
// --- H100 (228KB shared memory per block) ---
// #define BLOCK_SIZE 512
// =============================================================================
// Kernel 0: baseline SpGEMM, C = A * B with A, B in CSR and C emitted as COO.
//
// Grid/block layout: 1D grid of 1D blocks; one thread per row of A (threads
// past numRows exit immediately). Each thread owns a private dense accumulator
// of numCols floats plus a column list of up to numCols entries, carved out of
// the externally allocated pools (outputValuesPool / outputColsPool, each
// sized numRows * numCols elements by the host launcher).
//
// Finished rows are appended to cooMatrix_d with a single atomicAdd on
// numNonzeros per row, so output entries are unordered across rows.
__global__ void spmspm_kernel0(CSRMatrix* csrMatrix1_d, CSRMatrix* csrMatrix2_d, COOMatrix* cooMatrix_d,
                               unsigned int* outputColsPool, float* outputValuesPool, unsigned int numCols) {
    unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= csrMatrix1_d->numRows) return;

    // Per-thread slices of the preallocated pools. The offset is widened to
    // size_t so row * numCols cannot overflow 32-bit arithmetic on large inputs.
    size_t base = (size_t)row * numCols;
    float* outputValues = &outputValuesPool[base];
    unsigned int* outputCols = &outputColsPool[base];
    unsigned int numOutputCols = 0;

    // Clear the dense accumulator (indexed by output column).
    for (unsigned int c = 0; c < numCols; ++c) {
        outputValues[c] = 0.0f;
    }

    // For each nonzero A[row, col1], scatter value1 * B[col1, :] into the accumulator.
    for (unsigned int i1 = csrMatrix1_d->rowPtrs[row]; i1 < csrMatrix1_d->rowPtrs[row + 1]; ++i1) {
        unsigned int col1 = csrMatrix1_d->colIdxs[i1];
        float value1 = csrMatrix1_d->values[i1];
        unsigned int row2 = col1;  // row of B paired with column col1 of A
        for (unsigned int i2 = csrMatrix2_d->rowPtrs[row2]; i2 < csrMatrix2_d->rowPtrs[row2 + 1]; ++i2) {
            unsigned int col2 = csrMatrix2_d->colIdxs[i2];
            float product = value1 * csrMatrix2_d->values[i2];
            // Bug fix: the previous version recorded col2 whenever the
            // accumulator still read 0.0f, so an exactly-zero product left the
            // accumulator at zero and the SAME column was re-inserted on every
            // later hit, duplicating COO entries. Skip zero products entirely.
            // NOTE(review): duplicates can still occur if partial sums cancel
            // to exactly 0.0f and the column is hit again; closing that case
            // would need a separate occupancy-flag array.
            if (product != 0.0f) {
                if (outputValues[col2] == 0.0f) {
                    outputCols[numOutputCols++] = col2;
                }
                outputValues[col2] += product;
            }
        }
    }

    // Reserve a contiguous range in the COO output, then copy this row's entries.
    if (numOutputCols > 0) {
        unsigned int startIdx = atomicAdd(&cooMatrix_d->numNonzeros, numOutputCols);
        for (unsigned int i = 0; i < numOutputCols; ++i) {
            unsigned int col = outputCols[i];
            cooMatrix_d->rowIdxs[startIdx + i] = row;
            cooMatrix_d->colIdxs[startIdx + i] = col;
            cooMatrix_d->values[startIdx + i] = outputValues[col];
        }
    }
}
// Host-side launcher for spmspm_kernel0 (baseline SpGEMM: A * B -> COO).
//
// Parameters:
//   csrMatrix1   - host copy of A; read only for numRows
//   csrMatrix2   - host copy of B; read only for numCols
//   csrMatrix1_d - device copy of A, passed through to the kernel
//   csrMatrix2_d - device copy of B, passed through to the kernel
//   cooMatrix_d  - device output; the kernel atomically appends entries
//
// Allocates the per-row dense accumulator pools (numRows * numCols elements
// each) on the device, launches one thread per row of A, waits for
// completion, and frees the pools. All CUDA calls are checked; errors are
// reported to stderr, and on a partial allocation failure the launch is
// skipped and any buffer already allocated is released.
void spmspm_gpu0(CSRMatrix* csrMatrix1, CSRMatrix* csrMatrix2, CSRMatrix* csrMatrix1_d,
                 CSRMatrix* csrMatrix2_d, COOMatrix* cooMatrix_d) {
    unsigned int numRows = csrMatrix1->numRows;
    unsigned int numCols = csrMatrix2->numCols;
    unsigned int blockSize = BLOCK_SIZE;
    unsigned int numBlocks = (numRows + blockSize - 1) / blockSize;  // ceil-div

    // Compute element counts in size_t: numRows * numCols overflows 32-bit
    // unsigned arithmetic for large matrices, silently shrinking the buffers.
    size_t poolElems = (size_t)numRows * numCols;

    unsigned int* outputColsPool = NULL;
    float* outputValuesPool = NULL;
    cudaError_t err;

    err = cudaMalloc((void**)&outputColsPool, poolElems * sizeof(unsigned int));
    if (err != cudaSuccess) {
        fprintf(stderr, "spmspm_gpu0: cudaMalloc(outputColsPool) failed: %s\n",
                cudaGetErrorString(err));
        return;
    }
    err = cudaMalloc((void**)&outputValuesPool, poolElems * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "spmspm_gpu0: cudaMalloc(outputValuesPool) failed: %s\n",
                cudaGetErrorString(err));
        cudaFree(outputColsPool);
        return;
    }

    spmspm_kernel0<<<numBlocks, blockSize>>>(csrMatrix1_d, csrMatrix2_d, cooMatrix_d,
                                             outputColsPool, outputValuesPool, numCols);
    // Launch-configuration errors surface here, not from the launch itself.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "spmspm_gpu0: kernel launch failed: %s\n", cudaGetErrorString(err));
    }

    // Asynchronous in-kernel faults (e.g. illegal address) surface here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "spmspm_gpu0: kernel execution failed: %s\n", cudaGetErrorString(err));
    }

    cudaFree(outputColsPool);
    cudaFree(outputValuesPool);
}