
Commit 9bfeff9

Merge remote-tracking branch 'upstream/master'
1 parent 991e29f commit 9bfeff9

File tree: 3 files changed, +77 -64 lines


Common/include/linear_algebra/CGraphPartitioning.hpp

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
 
   CLevelScheduling() = delete; // Removing default constructor
 
-  void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType>& levelOffsets)
+  void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets)
   {
     for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
       const auto globalPoint = pointList[localPoint];
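
The change itself is one character: levelOffsets is now received by value instead of by reference, so Reorder works on a private copy and any mutation of it stays local to the call. A minimal sketch of the distinction (hypothetical function names, not SU2 code):

    #include <vector>
    using std::vector;

    // By value: the callee owns a copy, so writes stay local to the call.
    void takeByValue(vector<unsigned long> offsets) { offsets[0] = 42; }

    // By reference: the callee aliases the caller's vector, so writes escape.
    void takeByReference(vector<unsigned long>& offsets) { offsets[0] = 42; }

The copy costs an allocation proportional to the vector's length, presumably acceptable here because the level-offset array is short compared with the point lists.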

Common/include/linear_algebra/GPUComms.cuh

Lines changed: 6 additions & 2 deletions
@@ -36,7 +36,7 @@ namespace kernelParameters{
   /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/
   inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
 
-  static constexpr short MVP_BLOCK_SIZE = 1024;
+  static constexpr short MVP_BLOCK_SIZE = 256;
   static constexpr short MVP_WARP_SIZE = 32;
 
 };
@@ -62,12 +62,16 @@ struct matrixParameters{
     activeThreads = nVar * (kernelParameters::MVP_WARP_SIZE/nVar);
   }
 
+  __device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
+    return (localRow * blockSize + threadNo);
+  }
+
   __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
     return (row<totalRows && threadNo<threadLimit);
   }
 
   __device__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
-    return (rowInPartition && threadNo<activeThreads);
+    return (rowInPartition && threadNo<threadLimit);
   }
 
 };
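
Two things change in this header. First, validParallelAccess now honours the threadLimit argument the caller passes instead of always comparing against activeThreads, which is what lets call sites guard with blockRowSize, blockColSize, or blockSize as appropriate. Second, the new shrdMemIndex helper gives every row handled by a thread block its own stride of blockSize entries in the dynamically sized shared-memory scratch buffer, so a block needs rowsPerBlock * blockSize elements in total. A standalone sketch of that layout (illustrative only, with blockSize passed explicitly instead of read from the struct):

    // Linearizes (localRow, threadNo) into the per-block scratch buffer:
    //
    //   [ localRow 0: blockSize entries | localRow 1: blockSize entries | ... ]
    //
    __host__ __device__ inline unsigned int shrdMemIndexSketch(
        unsigned int localRow, unsigned int threadNo, unsigned int blockSize) {
      return localRow * blockSize + threadNo;
    }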

Common/src/linear_algebra/CSysMatrixGPU.cu

Lines changed: 70 additions & 61 deletions
@@ -33,34 +33,44 @@ using namespace kernelParameters;
 
 
 template<typename matrixType, typename vectorType>
-__device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, unsigned int threadNo, matrixParameters matrixParam)
+__device__ void GaussEliminationKernel(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam)
 {
-#define A(I, J) matrixCopy[(I) * matrixParam.blockColSize + (J)]
+  auto matrixIndex = [=](unsigned short x, unsigned short y){
+    return (x * matrixParam.blockColSize + y);
+  };
 
-  int x = threadNo/matrixParam.blockRowSize;
-  int y = threadNo % matrixParam.blockColSize;
+  auto vectorIndex = [=](unsigned short row, unsigned short elemNo){
+    return (row * matrixParam.blockRowSize + elemNo);
+  };
+
+  unsigned short x = threadNo/matrixParam.blockRowSize;
+  unsigned short y = threadNo % matrixParam.blockColSize;
 
   matrixType pivot, weight = 0.0;
 
   __syncthreads();
 
-  for(int currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++)
+  for(unsigned short currentRow=0; currentRow < matrixParam.blockRowSize; currentRow++)
   {
-    pivot = A(currentRow, currentRow);
-
+    if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockColSize)){
+      pivot = matrixCopy[matrixIndex(currentRow, currentRow)];
+    }
     __syncthreads();
 
-    A(currentRow, y) /= pivot;
-    if(y==0) prod[currentRow] /= pivot;
+    if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockColSize)){
+
+      matrixCopy[matrixIndex(currentRow, y)] /= pivot;
+      if(y==0) prod[vectorIndex(row, currentRow)] /= pivot;
 
-    weight = A(x, currentRow);
+      weight = matrixCopy[matrixIndex(x, currentRow)];
 
-    if(x!=currentRow)
-    {
-      A(x, y) -= weight * A(currentRow, y);
-      if(y==0) prod[x] -= weight * prod[currentRow];
+      if(x!=currentRow)
+      {
+        matrixCopy[matrixIndex(x, y)] -= weight * matrixCopy[matrixIndex(currentRow, y)];
+        if(y==0) prod[vectorIndex(row, x)] -= weight * prod[vectorIndex(row, currentRow)];
+      }
     }
-
+
     __syncthreads();
   }
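
What this routine implements is Gauss-Jordan elimination without pivoting on the small dense diagonal block: each thread owns one entry (x, y), the pivot row is normalized, and every other row is reduced in lockstep between __syncthreads() barriers. Pulling the validParallelAccess guards inside the loop, instead of guarding the call itself, lets every thread of the block keep reaching those barriers even when its row falls outside the current partition, avoiding divergent synchronization. A sequential host-side sketch of the same reduction (illustrative only, hypothetical names):

    // Gauss-Jordan on an n x n block A with right-hand side b, mirroring what
    // the device threads do cooperatively: on exit A is the identity and b
    // holds A^-1 * b.
    template <typename T>
    void gaussJordanSketch(T* A, T* b, int n) {
      for (int currentRow = 0; currentRow < n; ++currentRow) {
        const T pivot = A[currentRow * n + currentRow];  // no pivoting, as in the kernel
        for (int y = 0; y < n; ++y) A[currentRow * n + y] /= pivot;
        b[currentRow] /= pivot;
        for (int x = 0; x < n; ++x) {
          if (x == currentRow) continue;
          const T weight = A[x * n + currentRow];
          for (int y = 0; y < n; ++y) A[x * n + y] -= weight * A[currentRow * n + y];
          b[x] -= weight * b[currentRow];
        }
      }
    }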

@@ -71,69 +81,66 @@ template<typename matrixType, typename vectorType>
 __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam)
 {
 
-  auto matrixIndex = [=](unsigned long row, unsigned short threadNo){
-    return (d_row_ptr[row] * matrixParam.blockSize + threadNo);
-  };
-
-  auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){
-    return (d_dia_ptr[row] * matrixParam.blockSize + threadNo);
+  auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){
+    return (ptr_array[row] * matrixParam.blockSize + threadNo);
   };
 
   auto vectorIndex = [=](unsigned long row, unsigned short elemNo){
     return (row * matrixParam.blockRowSize + elemNo);
   };
 
-  unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE;
-  unsigned short localRow = row % matrixParam.rowsPerBlock;
+  auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){
+    return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
+  };
+
+  unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE;
+  unsigned short localRow = origRow % matrixParam.rowsPerBlock;
   unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE;
 
   unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
-  extern __shared__ vectorType tempBuffer[];
+  extern __shared__ matrixType tempBuffer[];
 
-  for(auto iPartition = 1ul; iPartition < matrixParam.nPartition; iPartition++)
+  for(auto iPartition = 1ul; iPartition < matrixParam.nPartition + 1; iPartition++)
   {
 
     /*We move to the set of rows present in the current partition.*/
-    row += d_partition_offsets[iPartition-1];
-
+    unsigned long row = origRow + d_partition_offsets[iPartition-1];
     bool rowInPartition = (row < d_partition_offsets[iPartition]);
 
     if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) {
 
       prod[vectorIndex(row, threadNo)] = 0.0;
-      tempBuffer[vectorIndex(localRow, threadNo)] = 0.0;
+      tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0;
     }
 
     __syncthreads();
 
     if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) {
 
-      for(unsigned long index = matrixIndex(row, threadNo); index < matrixDiagonalIndex(row, 0); index+=matrixParam.activeThreads)
+      for(unsigned long index = matrixIndex(row, threadNo, d_row_ptr); index < matrixIndex(row, 0, d_dia_ptr); index+=matrixParam.activeThreads)
       {
-
+
         unsigned short blockNo = index/matrixParam.blockSize;
         unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize;
 
-        atomicAdd(&tempBuffer[vectorIndex(localRow, blockRow)], matrix[index] * vec[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol]);
+        atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * vec[vectorMultIndex(blockNo, blockCol)]);
       }
     }
 
     __syncthreads();
 
-    if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) {
-
-      prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[vectorIndex(localRow, threadNo)];
-    }
-
-    __syncthreads();
-
     if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) {
 
-      tempBuffer[vectorIndex(localRow, threadNo)] = matrix[matrixDiagonalIndex(row, threadNo)];
-      GaussEliminationKernel(&tempBuffer[vectorIndex(localRow, threadNo)], &prod[vectorIndex(row, threadNo/matrixParam.blockColSize)], threadNo, matrixParam);
+      /*Write the result to the product vector.*/
+      if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = vec[vectorIndex(row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)];
+
+      /*Reinitialize the memory to hold the matrix diagonal elements now.*/
+      tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)];
     }
 
+    GaussEliminationKernel(&tempBuffer[matrixParam.shrdMemIndex(localRow, 0)], prod, row, threadNo, rowInPartition, matrixParam);
+
     __syncthreads();
 
   }
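
Conceptually this kernel is the forward half of the symmetric sweep, processed partition by partition: the partition offsets come from the level-scheduling reordering, so rows inside one partition have no mutual dependencies and their warps can proceed concurrently. Each row accumulates its strictly lower-triangular product L*x into shared memory, forms vec - L*x, and finishes with the diagonal-block solve above, i.e. x = D^-1 * (vec - L*x). A serial sketch of that recurrence for scalar 1x1 blocks (hypothetical CSR names, illustrative only):

    // Forward sweep in scalar CSR form. rowPtr/colInd/val is standard CSR and
    // diaPtr[i] indexes the diagonal entry of row i:
    //   x[i] = (v[i] - sum_{j < i} L(i,j) * x[j]) / D(i,i), in row order.
    void forwardSweepSketch(const unsigned long* rowPtr, const unsigned long* colInd,
                            const unsigned long* diaPtr, const double* val,
                            const double* v, double* x, unsigned long nRows) {
      for (unsigned long i = 0; i < nRows; ++i) {
        double acc = 0.0;
        for (unsigned long k = rowPtr[i]; k < diaPtr[i]; ++k)  // strictly lower entries
          acc += val[k] * x[colInd[k]];
        x[i] = (v[i] - acc) / val[diaPtr[i]];
      }
    }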
@@ -144,18 +151,18 @@ template<typename matrixType, typename vectorType>
 __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* prod, const unsigned long* d_partition_offsets, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, const unsigned long* d_dia_ptr, matrixParameters matrixParam)
 {
 
-  auto matrixIndex = [=](unsigned long row, unsigned short threadNo){
-    return (d_row_ptr[row] * matrixParam.blockSize + threadNo);
-  };
-
-  auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){
-    return (d_dia_ptr[row] * matrixParam.blockSize + threadNo);
+  auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long* ptr_array){
+    return (ptr_array[row] * matrixParam.blockSize + threadNo);
   };
 
   auto vectorIndex = [=](unsigned long row, unsigned short elemNo){
     return (row * matrixParam.blockRowSize + elemNo);
   };
 
+  auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){
+    return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
+  };
+
   unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE;
   unsigned short localRow = row % matrixParam.rowsPerBlock;
   unsigned short threadNo = threadIdx.x % MVP_WARP_SIZE;
@@ -164,58 +171,60 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p
 
   extern __shared__ vectorType tempBuffer[];
 
-  for(auto iPartition = 1ul; iPartition < matrixParam.nPartition; iPartition++)
+  for(auto iPartition = 1ul; iPartition < matrixParam.nPartition + 1; iPartition++)
   {
 
     /*We move to the set of rows present in the current partition.*/
+    row = (blockIdx.x * blockDim.x + threadIdx.x)/MVP_WARP_SIZE;
     row += d_partition_offsets[iPartition-1];
 
     bool rowInPartition = (row < d_partition_offsets[iPartition]);
 
     if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) {
 
-      tempBuffer[vectorIndex(localRow, threadNo)] = 0.0;
+      tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = 0.0;
     }
 
     __syncthreads();
 
     if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) {
 
-      for(unsigned long index = matrixDiagonalIndex(row, threadNo); index < matrixDiagonalIndex(row, 0) + matrixParam.blockSize; index+=matrixParam.activeThreads)
+      for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row, matrixParam.blockSize, d_dia_ptr); index+=matrixParam.activeThreads)
       {
 
         unsigned short blockNo = index/matrixParam.blockSize;
         unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize;
 
-        atomicAdd(&tempBuffer[vectorIndex(localRow, blockRow)], matrix[index] * prod[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol]);
+        atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]);
       }
-    }
 
-    if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.activeThreads)) {
-
-      for(unsigned long index = matrixDiagonalIndex(row, threadNo); index < matrixIndex(row+1, 0); index+=matrixParam.activeThreads)
+      for(unsigned long index = matrixIndex(row, threadNo, d_dia_ptr); index < matrixIndex(row+1, 0, d_row_ptr); index+=matrixParam.activeThreads)
       {
 
         unsigned short blockNo = index/matrixParam.blockSize;
         unsigned short blockRow = (index/matrixParam.blockColSize) % matrixParam.blockRowSize;
 
-        atomicAdd(&tempBuffer[vectorIndex(localRow, blockRow)], -(matrix[index] * prod[(d_col_ind[blockNo]) * matrixParam.blockColSize + blockCol]));
+        atomicAdd(&tempBuffer[matrixParam.shrdMemIndex(localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex(blockNo, blockCol)]));
      }
    }
 
     __syncthreads();
 
-    if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockRowSize)) {
-
-      prod[vectorIndex(row, threadNo)] = tempBuffer[vectorIndex(localRow, threadNo)];
+    if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) {
+
+      /*Write the result to the product vector.*/
+      if(threadNo < matrixParam.blockRowSize) prod[vectorIndex(row, threadNo)] = tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)];
+
+      /*Reinitialize the memory to hold the matrix diagonal elements now.*/
+      tempBuffer[matrixParam.shrdMemIndex(localRow, threadNo)] = matrix[matrixIndex(row, threadNo, d_dia_ptr)];
     }
 
     __syncthreads();
 
     if(matrixParam.validParallelAccess(rowInPartition, threadNo, matrixParam.blockSize)) {
 
-      tempBuffer[vectorIndex(localRow, threadNo)] = matrix[(d_dia_ptr[row] * matrixParam.blockSize) + threadNo];
-      GaussEliminationKernel(&tempBuffer[vectorIndex(localRow, threadNo)], &prod[vectorIndex(row, threadNo/matrixParam.blockColSize)], threadNo, matrixParam);
+      tempBuffer[localRow * matrixParam.blockSize + threadNo] = matrix[(d_dia_ptr[row] * matrixParam.blockSize) + threadNo];
+      GaussEliminationKernel(&tempBuffer[localRow * matrixParam.blockSize], prod, row, threadNo, rowInPartition, matrixParam);
     }
 
     __syncthreads();
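
This kernel is the backward half of the symmetric iteration; in an LU-SGS style preconditioner the backward pass solves (D + U) * x = D * xStar, where xStar is the intermediate result of the forward sweep. As a serial scalar reference for that textbook recurrence (hypothetical names, not a line-for-line transcription of the kernel above):

    // Backward sweep in scalar CSR form, rows traversed last to first:
    //   x[i] = (D(i,i) * xStar[i] - sum_{j > i} U(i,j) * x[j]) / D(i,i).
    // On entry x holds xStar; on exit it holds the updated solution.
    void backwardSweepSketch(const unsigned long* rowPtr, const unsigned long* colInd,
                             const unsigned long* diaPtr, const double* val,
                             double* x, unsigned long nRows) {
      for (unsigned long i = nRows; i-- > 0;) {
        double acc = val[diaPtr[i]] * x[i];                // D * xStar term
        for (unsigned long k = diaPtr[i] + 1; k < rowPtr[i + 1]; ++k)
          acc -= val[k] * x[colInd[k]];                    // minus strictly upper part
        x[i] = acc / val[diaPtr[i]];
      }
    }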
@@ -294,7 +303,7 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   unsigned int gridx = rounded_up_division(MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE);
   dim3 gridDim(gridx, 1, 1);
 
-  MatrixVectorProductKernel<<<gridDim, blockDim>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
+  MatrixVectorProductKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
   gpuErrChk( cudaPeekAtLastError() );
 
   prod.DtHTransfer();
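
The launch-site fix supplies the third argument of the execution configuration, which sizes the extern __shared__ buffer the kernel declares; without it the dynamically declared buffer has zero bytes and every shared-memory access is out of bounds. Here the footprint is rowsPerBlock * blockRowSize elements of ScalarType, one slot per row handled by the block. A minimal standalone illustration of the mechanism (hypothetical kernel, not the SU2 one; compile with nvcc):

    // The unsized extern array gets its byte count from the third <<<...>>>
    // launch parameter rather than from a compile-time declaration.
    __global__ void scratchKernel(float* out) {
      extern __shared__ float scratch[];  // sized at launch time
      scratch[threadIdx.x] = static_cast<float>(threadIdx.x);
      __syncthreads();
      out[threadIdx.x] = scratch[blockDim.x - 1 - threadIdx.x];  // reversed copy
    }

    int main() {
      const int nThreads = 128;
      float* d_out = nullptr;
      cudaMalloc(&d_out, nThreads * sizeof(float));
      // Third parameter: bytes of dynamic shared memory per block.
      scratchKernel<<<1, nThreads, nThreads * sizeof(float)>>>(d_out);
      cudaDeviceSynchronize();
      cudaFree(d_out);
      return 0;
    }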
