Write comments on algorithms (#1158)

beomki-yeo · stephenswat · web-flow · commit 96b9f6dfdc02 · 2025-10-15T12:00:18.000Z
Co-authored-by: Stephen Nicholas Swatman <stephen.nicholas.swatman@cern.ch>
diff --git a/device/cuda/src/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.cu b/device/cuda/src/ambiguity_resolution/greedy_ambiguity_resolution_algorithm.cu
diff --git a/device/cuda/src/ambiguity_resolution/kernels/add_block_offset.cu b/device/cuda/src/ambiguity_resolution/kernels/add_block_offset.cu
@@ -31,6 +31,8 @@ __global__ void add_block_offset(device::add_block_offset_payload payload) {
         return;
     }
 
+    // Add the scanned block offsets to block-wise prefix sums of the number of
+    // updated tracks.
     prefix_sums[globalIndex] += block_offsets[blockIdx.x - 1];
 }
 
diff --git a/device/cuda/src/ambiguity_resolution/kernels/block_inclusive_scan.cu b/device/cuda/src/ambiguity_resolution/kernels/block_inclusive_scan.cu
@@ -22,6 +22,7 @@ __global__ void block_inclusive_scan(
         return;
     }
 
+    // temporary buffer where the block-wise prefix sum will be calculated
     extern __shared__ int shared_temp[];
 
     vecmem::device_vector<const unsigned int> sorted_ids(
@@ -37,13 +38,16 @@ __global__ void block_inclusive_scan(
 
     if (globalIndex >= n_accepted) {
         shared_temp[threadIndex] = 0;
-    } else {
+    }
+    // Otherwise, start from the boolean flag telling whether the track id
+    // corresponding to the current thread was updated during the iteration
+    else {
         shared_temp[threadIndex] = is_updated[sorted_ids[globalIndex]];
     }
 
     __syncthreads();
 
-    // inclusive scan in shared memory
+    // Inclusive scan the boolean numbers to calculate the block-wise prefix sum
     for (int stride = 1; stride < blockDim.x; stride *= 2) {
         int val = 0;
         if (threadIndex >= stride) {
@@ -56,12 +60,15 @@ __global__ void block_inclusive_scan(
         __syncthreads();
     }
 
+    // Move the block-wise prefix_sums to global memory
     if (globalIndex < n_accepted) {
         prefix_sums[globalIndex] = shared_temp[threadIndex];
     }
 
     __syncthreads();
 
+    // Block offset, the last element of block-wise prefix sums, is also
+    // recorded to calculate full prefix sums later
     if (threadIndex == blockDim.x - 1) {
         block_offsets[blockIdx.x] = shared_temp[threadIndex];
     }
diff --git a/device/cuda/src/ambiguity_resolution/kernels/fill_inverted_ids.cu b/device/cuda/src/ambiguity_resolution/kernels/fill_inverted_ids.cu
@@ -31,6 +31,8 @@ __global__ void fill_inverted_ids(device::fill_inverted_ids_payload payload) {
         return;
     }
 
+    // Fill the inverted_ids vector which converts a track id to the index of
+    // sorted ids
     inverted_ids[sorted_ids[globalIndex]] = globalIndex;
 }
 
diff --git a/device/cuda/src/ambiguity_resolution/kernels/rearrange_tracks.cu b/device/cuda/src/ambiguity_resolution/kernels/rearrange_tracks.cu
@@ -96,6 +96,9 @@ __launch_bounds__(1024) __global__
         int ini_idx = stride * (threadIdx.x % nThreads_per_track);
         int fin_idx = std::min(ini_idx + stride, static_cast<int>(N));
 
+        // If it is an updated track, find new sorted index by using the binary
+        // search. The index is also shifted by using the bitonic sort result
+        // from sort_updated_tracks and prefix sums
         if (is_updated[tid]) {
 
             if (gid > 0) {
@@ -188,7 +191,10 @@ __launch_bounds__(1024) __global__
             if (offset != 0) {
                 atomicAdd(&shifted_idx, offset);
             }
-        } else {
+        }
+        // If it is not an updated track, it is enough to count the number of
+        // updated tracks which need to come earlier.
+        else {
 
             for (int i = ini_idx; i < fin_idx; i++) {
 
@@ -209,6 +215,7 @@ __launch_bounds__(1024) __global__
 
     __syncthreads();
 
+    // Save the result of new indices into a temporary buffer
     if (is_valid_thread && (threadIdx.x % nThreads_per_track) == 0) {
         temp_sorted_ids.at(shifted_idx) = tid;
     }
diff --git a/device/cuda/src/ambiguity_resolution/kernels/scan_block_offsets.cu b/device/cuda/src/ambiguity_resolution/kernels/scan_block_offsets.cu
@@ -39,7 +39,8 @@ __global__ void scan_block_offsets(device::scan_block_offsets_payload payload) {
     }
     __syncthreads();
 
-    // 2. Inclusive scan (Hillis-Steele style)
+    // 2. Inclusive scan to calculate the scanned block offsets, i.e. the
+    // prefix sums of the block offsets
     for (int offset = 1; offset < n_blocks_prev; offset *= 2) {
         int temp = 0;
         if (threadIndex >= offset) {
diff --git a/device/cuda/src/ambiguity_resolution/kernels/sort_updated_tracks.cu b/device/cuda/src/ambiguity_resolution/kernels/sort_updated_tracks.cu
@@ -32,7 +32,7 @@ __launch_bounds__(512) __global__
 
     const unsigned int tid = threadIdx.x;
 
-    // Load to shared memory
+    // Load updated track indices into shared memory (for sorting)
     shared_mem_tracks[tid] = std::numeric_limits<unsigned int>::max();
 
     if (tid < *(payload.n_updated_tracks)) {
@@ -90,6 +90,7 @@ __launch_bounds__(512) __global__
         }
     }
 
+    // Write back the sorted result from shared memory to global memory
     if (tid < *(payload.n_updated_tracks)) {
         updated_tracks[tid] = shared_mem_tracks[tid];
     }

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,8 @@ __global__ void add_block_offset(device::add_block_offset_payload payload) {`
`31`	`31`	`return;`
`32`	`32`	`}`
`33`	`33`
	`34`	`+ // Add the scanned block offsets to block-wise prefix sums of the number of`
	`35`	`+ // updated tracks.`
`34`	`36`	`prefix_sums[globalIndex] += block_offsets[blockIdx.x - 1];`
`35`	`37`	`}`
`36`	`38`
Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,8 @@ __global__ void fill_inverted_ids(device::fill_inverted_ids_payload payload) {`
`31`	`31`	`return;`
`32`	`32`	`}`
`33`	`33`
	`34`	`+ // Fill the inverted_ids vector which converts a track id to the index of`
	`35`	`+ // sorted ids`
`34`	`36`	`inverted_ids[sorted_ids[globalIndex]] = globalIndex;`
`35`	`37`	`}`
`36`	`38`
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,8 @@ __global__ void scan_block_offsets(device::scan_block_offsets_payload payload) {`
`39`	`39`	`}`
`40`	`40`	`__syncthreads();`
`41`	`41`
`42`		`- // 2. Inclusive scan (Hillis-Steele style)`
	`42`	`+ // 2. Inclusive scan to calculate the scanned block offsets, i.e. the`
	`43`	`+ // prefix sums of the block offsets`
`43`	`44`	`for (int offset = 1; offset < n_blocks_prev; offset *= 2) {`
`44`	`45`	`int temp = 0;`
`45`	`46`	`if (threadIndex >= offset) {`
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ __launch_bounds__(512) __global__`
`32`	`32`
`33`	`33`	`const unsigned int tid = threadIdx.x;`
`34`	`34`
`35`		`- // Load to shared memory`
	`35`	`+ // Load updated track indices into shared memory (for sorting)`
`36`	`36`	`shared_mem_tracks[tid] = std::numeric_limits<unsigned int>::max();`
`37`	`37`
`38`	`38`	`if (tid < *(payload.n_updated_tracks)) {`
`@@ -90,6 +90,7 @@ __launch_bounds__(512) __global__`
`90`	`90`	`}`
`91`	`91`	`}`
`92`	`92`
	`93`	`+ // Write back the sorted result from shared memory to global memory`
`93`	`94`	`if (tid < *(payload.n_updated_tracks)) {`
`94`	`95`	`updated_tracks[tid] = shared_mem_tracks[tid];`
`95`	`96`	`}`