Commit cdbc05b (parent 1b07b59)
Author: Ubuntu

Fixes to work with CUDA 12 toolkit

4 files changed: +99 -66 lines

src/cudamatrix/cu-kernels.cu (40 additions, 33 deletions)
@@ -953,11 +953,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   }
 
   // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
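This same rewrite is applied to every warp reduction in the file. The old code guarded the whole unrolled loop with `if (tid < warpSize)` and relied on the warp executing in lockstep; that legacy warp-synchronous idiom has been deprecated since CUDA 9, when Volta introduced independent thread scheduling, and with the CUDA 12 toolkit it evidently no longer works, hence this commit. The fix moves the guard inside the loop and ends each step with `__syncwarp()`, so every lane reaches the barrier and every shared-memory write is visible across the warp before the next step. A minimal, self-contained sketch of the corrected pattern (the kernel `block_sum_sketch` is illustrative, not part of this commit; the guard is tightened to `tid < shift`, which also avoids the redundant overlapping adds of the committed `tid < warpSize` form):

    // Sketch: block-level sum with an explicitly synchronized warp-reduce tail.
    // Assumes blockDim.x == 64 == 2 * warpSize, one block per output element.
    __global__ void block_sum_sketch(const float *in, float *out) {
      __shared__ float smem[64];
      int tid = threadIdx.x;
      smem[tid] = in[blockIdx.x * blockDim.x + tid];
      __syncthreads();
      for (int shift = warpSize; shift > 0; shift >>= 1) {
        if (tid < shift) {
          smem[tid] += smem[tid + shift];  // fold the upper half onto the lower
        }
        __syncwarp();  // required on Volta+: lanes no longer run in lockstep
      }
      if (tid == 0) out[blockIdx.x] = smem[0];
    }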
@@ -1206,11 +1207,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1257,12 +1259,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
   for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-    ssum[tid] += ssum[tid + shift];
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
-  }
 
   // output TileDim sums per thread block
   if (tid < TileDim) {
@@ -1340,13 +1343,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
-
   // output TileDim sums per thread block
   if (tid < TileDim && j_n < dim_N.cols) {
     v[j_n] = alpha * smem.sum[tid] + beta * v[j_n];
@@ -1793,10 +1796,11 @@ static void _vec_transform_reduce(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2006,9 +2010,11 @@ static void _transform_reduce_mat_rows(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2045,11 +2051,13 @@ static void _transform_reduce_mat_cols(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
-
+
   // Output to vector result.
   if (tid == 0) {
     result[i] = op.PostReduce(sdata[0], result[i]);
@@ -2087,28 +2095,25 @@ static void _group_transform_reduce(
       x_idx += threads_per_group;
     }
     sreduction[tid] = treduction;
-    if (threads_per_group > warpSize) {
-      __syncthreads();
-    }
+    __syncthreads();
 
     // tree-reduce to 2x warpSize elements per group
 # pragma unroll
-    for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) {
+    int shift = threads_per_group / 2;
+    for (; shift > warpSize; shift >>= 1) {
       if (threadIdx.x < shift) {
         sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
       }
       __syncthreads();
     }
 
     // Warp-reduce to 1 element per group.
-    // Threads implicitly synchronized within the warp.
-    const int warp_reduce_size =
-        threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize;
-    if (threadIdx.x < warp_reduce_size) {
 # pragma unroll
-      for (int shift = warp_reduce_size; shift > 0; shift >>= 1) {
+    for (; shift > 0; shift >>= 1) {
+      if (threadIdx.x < shift) {
         sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
       }
+      __syncwarp();
     }
 
     // Store the result.
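Besides the `__syncwarp()` change, `_group_transform_reduce` hoists `shift` out of the first loop so the warp-level phase resumes exactly where the cross-warp `__syncthreads()` phase stopped, replacing the old `warp_reduce_size` computation. A worked example of the shift sequence, assuming `threads_per_group = 256` and `warpSize = 32`:

    // Phase 1, __syncthreads() after each step (folds across warps):
    //   shift = 128, 64            -> 64 = 2 * warpSize partials per group remain
    // Phase 2, __syncwarp() after each step (single warp per group):
    //   shift = 32, 16, 8, 4, 2, 1 -> 1 result per group in sreduction[0]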
@@ -2967,12 +2972,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
   }
 
   // reduce to 1 element per row
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sprod[tid] += sprod[tid + shift];
       snorm[tid] += snorm[tid + shift];
     }
+    __syncwarp();
   }
 
   // broadcast the sum results
@@ -3254,15 +3260,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
   }
   // Warp reduce without __syncthreads()
   // (note.: synchronizes implicitly within a warp at the multiprocessor)
-  if (tid < warpSize / 2) {
 #pragma unroll
-    for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
-         num_working_threads >>= 1) {
+  for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
+       num_working_threads >>= 1) {
+    if (tid < warpSize / 2) {
       if (smax[tid + num_working_threads] > smax[tid]) {
         smax[tid] = smax[tid + num_working_threads];
         sidx[tid] = sidx[tid + num_working_threads];
       }
     }
+    __syncwarp();
  }
 
   if (tid == 0) {
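Note that `__syncwarp()` with its default full mask synchronizes all 32 lanes of the calling warp, including the lanes that failed the guard, which is why the guard can safely sit inside the loop: every lane still reaches the barrier. For sum-style reductions there is also a shuffle-based alternative that sidesteps shared memory, and hence the barrier, for the intra-warp stage entirely; a sketch of that common pattern (not what this commit does):

    // Alternative: tree reduction with __shfl_down_sync; no shared memory
    // or __syncwarp() needed for the intra-warp stage.
    __device__ float warp_sum_sketch(float v) {
      for (int shift = warpSize / 2; shift > 0; shift >>= 1)
        v += __shfl_down_sync(0xffffffff, v, shift);
      return v;  // lane 0 holds the warp's sum
    }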

src/cudamatrix/cu-sparse-matrix-test.cc (4 additions, 2 deletions)
@@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() {
 template <typename Real>
 static void UnitTestCuSparseMatrixTraceMatSmat() {
   for (int32 i = 0; i < 2; i++) {
-    MatrixIndexT row = 10 + Rand() % 40;
-    MatrixIndexT col = 10 + Rand() % 50;
+    MatrixIndexT row = 2 + Rand() % 3;
+    MatrixIndexT col = 1 + Rand() % 4;
 
     CuMatrix<Real> mat1(row, col);
     CuMatrix<Real> mat2(col, row);
@@ -147,11 +147,13 @@ static void UnitTestCuSparseMatrixTraceMatSmat() {
     cu_smat2.CopyToMat(&mat2);
 
     Real trace1 = TraceMatMat(mat3, mat1, kTrans);
+
     Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans);
     AssertEqual(trace1, trace2, 0.00001);
 
     trace1 = TraceMatMat(mat3, mat2, kNoTrans);
     trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans);
+
     AssertEqual(trace1, trace2, 0.00001);
   }
 }
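The test now draws much smaller random shapes. Worked out, assuming `Rand()` returns a uniform non-negative integer:

    // old: row = 10 + Rand() % 40  -> rows in [10, 49]
    //      col = 10 + Rand() % 50  -> cols in [10, 59]
    // new: row = 2 + Rand() % 3    -> rows in [2, 4]
    //      col = 1 + Rand() % 4    -> cols in [1, 4]

Tiny matrices like these make degenerate cases (rows with no nonzeros, very small `nnz`) far more likely, which plausibly targets the new `nnz == 0` handling added to `CuSparseMatrix<Real>::Resize` below.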

src/cudamatrix/cu-sparse-matrix.cc (44 additions, 21 deletions)
@@ -161,7 +161,7 @@ void CuSparseMatrix<Real>::SelectRows(const CuArray<int32> &row_indexes,
 template<typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes, int32 dim,
                                      MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
         NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
@@ -194,8 +194,8 @@ template<typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes,
                                      const CuVectorBase<Real> &weights,
                                      int32 dim, MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
-        NULL) {
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL),
+    csr_val_(NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined);
@@ -266,8 +266,9 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
         1 * sizeof(int)));
+    csr_col_idx_ = NULL;  // may be freed, but this is allowed.
     csr_val_ = NULL;
   } else {
     KALDI_ASSERT(num_rows > 0);
@@ -277,10 +278,16 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = num_rows;
     num_cols_ = num_cols;
     nnz_ = nnz;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
-        (num_rows + 1 + nnz) * sizeof(int)));
-    csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int)));
+    if (nnz > 0) {
+      csr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+          nnz * sizeof(int)));
+      csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
          nnz * sizeof(Real)));
+    } else {
+      csr_col_idx_ = NULL;
+      csr_val_ = NULL;
+    }
     CuSubArray<int> row_ptr(CsrRowPtr(), NumRows() + 1);
     row_ptr.Set(nnz);
     if (resize_type == kSetZero) {
if (resize_type == kSetZero) {
@@ -302,16 +309,20 @@ void CuSparseMatrix<Real>::Destroy() {
302309
#if HAVE_CUDA == 1
303310
if (CuDevice::Instantiate().Enabled()) {
304311
CuTimer tim;
305-
if (csr_row_ptr_col_idx_) {
306-
CuDevice::Instantiate().Free(csr_row_ptr_col_idx_);
312+
if (csr_row_ptr_) {
313+
CuDevice::Instantiate().Free(csr_row_ptr_);
314+
}
315+
if (csr_col_idx_) {
316+
CuDevice::Instantiate().Free(csr_col_idx_);
307317
}
308318
if (csr_val_) {
309319
CuDevice::Instantiate().Free(csr_val_);
310320
}
311321
num_rows_ = 0;
312322
num_cols_ = 0;
313323
nnz_ = 0;
314-
csr_row_ptr_col_idx_ = NULL;
324+
csr_row_ptr_ = NULL;
325+
csr_col_idx_ = NULL;
315326
csr_val_ = NULL;
316327
CuDevice::Instantiate().AccuProfile(__func__, tim);
317328
} else
@@ -378,11 +389,17 @@ void CuSparseMatrix<Real>::CopyFromSmat(const CuSparseMatrix<Real>& smat,
     CuSubVector<Real> val_from(smat.CsrVal(), smat.NumElements());
     val_to.CopyFromVec(val_from);
 
-    CuSubArray<int> idx_to(csr_row_ptr_col_idx_,
-                           NumRows() + 1 + NumElements());
-    CuSubArray<int> idx_from(smat.csr_row_ptr_col_idx_,
-                             smat.NumRows() + 1 + smat.NumElements());
-    idx_to.CopyFromArray(idx_from);
+    {
+      CuSubArray<int> idx_to(csr_row_ptr_, NumRows() + 1);
+      CuSubArray<int> idx_from(smat.csr_row_ptr_, NumRows() + 1);
+      idx_to.CopyFromArray(idx_from);
+    }
+
+    {
+      CuSubArray<int> idx_to(csr_col_idx_, NumElements());
+      CuSubArray<int> idx_from(smat.csr_col_idx_, NumElements());
+      idx_to.CopyFromArray(idx_from);
+    }
 
   } else {
     Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined);
@@ -413,9 +430,14 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<OtherReal> *smat) const {
     smat->Resize(0, 0);
     return;
   }
-  CuSubArray<int> idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements());
-  std::vector<int> idx_cpu;
-  idx.CopyToVec(&idx_cpu);
+  CuSubArray<int> row_ptr(csr_row_ptr_, NumRows() + 1);
+  std::vector<int> row_ptr_cpu;
+  row_ptr.CopyToVec(&row_ptr_cpu);
+
+
+  CuSubArray<int> col_idx(csr_col_idx_, NumElements());
+  std::vector<int> col_idx_cpu;
+  col_idx.CopyToVec(&col_idx_cpu);
 
   CuSubVector<Real> val(CsrVal(), NumElements());
   Vector<OtherReal> val_cpu(NumElements(), kUndefined);
Vector<OtherReal> val_cpu(NumElements(), kUndefined);
@@ -425,8 +447,8 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<OtherReal> *smat) const {
425447
NumRows());
426448
int n = 0;
427449
for (int i = 0; i < NumRows(); ++i) {
428-
for (; n < idx_cpu[i + 1]; ++n) {
429-
const MatrixIndexT j = idx_cpu[NumRows() + 1 + n];
450+
for (; n < row_ptr_cpu[i + 1]; ++n) {
451+
const MatrixIndexT j = col_idx_cpu[n];
430452
pairs[i].push_back( { j, val_cpu(n) });
431453
}
432454
}
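With the split buffers, this walk is the textbook CSR traversal: `row_ptr_cpu[i]` to `row_ptr_cpu[i + 1]` delimits row i's nonzeros, and `col_idx_cpu[n]` is the column of the n-th nonzero directly, with no `NumRows() + 1` offset to skip past the row pointers. A worked example for a hypothetical 3 x 4 matrix:

    // Matrix (3 x 4):      CSR arrays (zero-based):
    //   [ 5 0 0 2 ]        row_ptr = [0, 2, 3, 5]     (NumRows() + 1 entries)
    //   [ 0 3 0 0 ]        col_idx = [0, 3, 1, 0, 2]  (nnz entries)
    //   [ 7 0 1 0 ]        val     = [5, 2, 3, 7, 1]
    // Row 1's entries: n in [row_ptr[1], row_ptr[2]) = [2, 3),
    // i.e. the single nonzero val[2] = 3 at column col_idx[2] = 1.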
@@ -484,7 +506,8 @@ void CuSparseMatrix<Real>::Swap(CuSparseMatrix<Real> *smat) {
     std::swap(num_rows_, smat->num_rows_);
     std::swap(num_cols_, smat->num_cols_);
     std::swap(nnz_, smat->nnz_);
-    std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_);
+    std::swap(csr_row_ptr_, smat->csr_row_ptr_);
+    std::swap(csr_col_idx_, smat->csr_col_idx_);
     std::swap(csr_val_, smat->csr_val_);
   } else
 #endif
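The renaming throughout cu-sparse-matrix.cc replaces Kaldi's old single-allocation trick, in which `csr_row_ptr_col_idx_` packed the `NumRows() + 1` row pointers and the `nnz` column indices into one buffer, with two separate allocations. A plausible motivation (not stated in the commit) is cuSPARSE's generic API, the only CSR interface left in the CUDA 12 toolkit, which takes the row offsets and column indices as distinct device pointers. A sketch of how the split buffers would feed it (descriptor creation only, error handling omitted; assumes `float` values and 32-bit indices):

    #include <cusparse.h>

    // Wrap the three CSR device buffers in a cuSPARSE generic-API descriptor.
    cusparseSpMatDescr_t desc;
    cusparseCreateCsr(&desc, num_rows, num_cols, nnz,
                      csr_row_ptr_,   // num_rows + 1 row offsets
                      csr_col_idx_,   // nnz column indices
                      csr_val_,       // nnz values
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);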
