Skip to content

Commit 96b9f6d

Browse files
Write comments on algorithms (#1158)
Co-authored-by: Stephen Nicholas Swatman <[email protected]>
1 parent 4314df2 commit 96b9f6d

File tree

7 files changed

+123
-37
lines changed

7 files changed

+123
-37
lines changed

device/cuda/src/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.cu

Lines changed: 98 additions & 32 deletions
Large diffs are not rendered by default.

device/cuda/src/ambiguity_resolution/kernels/add_block_offset.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ __global__ void add_block_offset(device::add_block_offset_payload payload) {
3131
return;
3232
}
3333

34+
// Add the scanned block offsets to block-wise prefix sums of the number of
35+
// updated tracks.
3436
prefix_sums[globalIndex] += block_offsets[blockIdx.x - 1];
3537
}
3638

device/cuda/src/ambiguity_resolution/kernels/block_inclusive_scan.cu

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ __global__ void block_inclusive_scan(
2222
return;
2323
}
2424

25+
// temporary buffer where the block-wise prefix sum will be calculated
2526
extern __shared__ int shared_temp[];
2627

2728
vecmem::device_vector<const unsigned int> sorted_ids(
@@ -37,13 +38,16 @@ __global__ void block_inclusive_scan(
3738

3839
if (globalIndex >= n_accepted) {
3940
shared_temp[threadIndex] = 0;
40-
} else {
41+
}
42+
// Start with boolean number depending on whether track id corresponding to
43+
// the current thread is updated during the iteration
44+
else {
4145
shared_temp[threadIndex] = is_updated[sorted_ids[globalIndex]];
4246
}
4347

4448
__syncthreads();
4549

46-
// inclusive scan in shared memory
50+
// Inclusive scan the boolean numbers to calculate the block-wise prefix sum
4751
for (int stride = 1; stride < blockDim.x; stride *= 2) {
4852
int val = 0;
4953
if (threadIndex >= stride) {
@@ -56,12 +60,15 @@ __global__ void block_inclusive_scan(
5660
__syncthreads();
5761
}
5862

63+
// Move the block-wise prefix_sums to global memory
5964
if (globalIndex < n_accepted) {
6065
prefix_sums[globalIndex] = shared_temp[threadIndex];
6166
}
6267

6368
__syncthreads();
6469

70+
// Block offset, the last element of block-wise prefix sums, is also
71+
// recorded to calculate full prefix sums later
6572
if (threadIndex == blockDim.x - 1) {
6673
block_offsets[blockIdx.x] = shared_temp[threadIndex];
6774
}

device/cuda/src/ambiguity_resolution/kernels/fill_inverted_ids.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ __global__ void fill_inverted_ids(device::fill_inverted_ids_payload payload) {
3131
return;
3232
}
3333

34+
// Fill the inverted_ids vector which converts a track id to the index of
35+
// sorted ids
3436
inverted_ids[sorted_ids[globalIndex]] = globalIndex;
3537
}
3638

device/cuda/src/ambiguity_resolution/kernels/rearrange_tracks.cu

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ __launch_bounds__(1024) __global__
9696
int ini_idx = stride * (threadIdx.x % nThreads_per_track);
9797
int fin_idx = std::min(ini_idx + stride, static_cast<int>(N));
9898

99+
// If it is an updated track, find new sorted index by using the binary
100+
// search. The index is also shifted by using the bitonic sort result
101+
// from sort_updated_tracks and prefix sums
99102
if (is_updated[tid]) {
100103

101104
if (gid > 0) {
@@ -188,7 +191,10 @@ __launch_bounds__(1024) __global__
188191
if (offset != 0) {
189192
atomicAdd(&shifted_idx, offset);
190193
}
191-
} else {
194+
}
195+
// If it is not an updated track, it is enough to count the number of
196+
// updated tracks which need to come earlier.
197+
else {
192198

193199
for (int i = ini_idx; i < fin_idx; i++) {
194200

@@ -209,6 +215,7 @@ __launch_bounds__(1024) __global__
209215

210216
__syncthreads();
211217

218+
// Save the result of new indices into a temporary buffer
212219
if (is_valid_thread && (threadIdx.x % nThreads_per_track) == 0) {
213220
temp_sorted_ids.at(shifted_idx) = tid;
214221
}

device/cuda/src/ambiguity_resolution/kernels/scan_block_offsets.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ __global__ void scan_block_offsets(device::scan_block_offsets_payload payload) {
3939
}
4040
__syncthreads();
4141

42-
// 2. Inclusive scan (Hillis-Steele style)
42+
// 2. Inclusive scan to caculated the scanned block offset which is the
43+
// prefix sum of block offsets
4344
for (int offset = 1; offset < n_blocks_prev; offset *= 2) {
4445
int temp = 0;
4546
if (threadIndex >= offset) {

device/cuda/src/ambiguity_resolution/kernels/sort_updated_tracks.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ __launch_bounds__(512) __global__
3232

3333
const unsigned int tid = threadIdx.x;
3434

35-
// Load to shared memory
35+
// Load updated track indices into shared memory (for sorting)
3636
shared_mem_tracks[tid] = std::numeric_limits<unsigned int>::max();
3737

3838
if (tid < *(payload.n_updated_tracks)) {
@@ -90,6 +90,7 @@ __launch_bounds__(512) __global__
9090
}
9191
}
9292

93+
// Write back the sorted result from shared memory to global memory
9394
if (tid < *(payload.n_updated_tracks)) {
9495
updated_tracks[tid] = shared_mem_tracks[tid];
9596
}

0 commit comments

Comments
 (0)