@@ -458,10 +458,55 @@ greedy_ambiguity_resolution_algorithm::operator()(
nThreads_scan = scan_dim.first;
nBlocks_scan = scan_dim.second;

/*
CUDA kernel sequence with multiple streams
├── reset_status
├── find_max_shared
├── remove_tracks
│ │
│ └── [record event_removal]
├── sort_updated_tracks (Main stream — executed after event_removal)
│ │
│ └── [record event_main]
├───▶ stream_fill
│ │
│ └── [wait for event_removal]
│ │
│ └── fill_inverted_ids
│ │
│ └── [record event_fill]
├───▶ stream_scan
│ │
│ └── [wait for event_removal]
│ │
│ ├── block_inclusive_scan
│ ├── scan_block_offsets
│ └── add_block_offset
│ │
│ └── [record event_scan]
├── [wait for event_main, event_fill, and event_scan] ← sync point
├── rearrange_tracks
└── gather_tracks
*/

// Make CUDA Graph
cudaGraph_t graph;
cudaGraphExec_t graphExec;

// Create streams for parallelizable kernels
cudaStream_t stream_fill, stream_scan;
cudaStreamCreateWithFlags(&stream_fill, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream_scan, cudaStreamNonBlocking);
Member:

Not only do you create the streams inside of the algorithm's execute function (as opposed to its constructor), but you do so inside of the algorithm's while-loop.

Plus, I don't see any cudaStreamDestroy(...) statements.

Contributor Author (@beomki-yeo, Aug 20, 2025):

Good catch. I added cudaStreamDestroy.

I took stream_fill and stream_scan out of the loop, so they are created and destroyed only once per event.

I don't know why you think creating and destroying the streams is expensive; do you have any reliable reference on this? I believe it is on the microsecond scale, while the usual processing time per event is already a few hundred milliseconds. If we really need to save that << 0.1% of the run time we could change this PR, but I don't think it is worth that much 🤔
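
(For reference, one way to guarantee the create-once / destroy-once behaviour discussed above is to give the streams RAII ownership in the algorithm object. The sketch below is illustrative only; the wrapper class and member names are hypothetical, not traccc code, and error checking is omitted for brevity.)

#include <cuda_runtime.h>

// Hypothetical RAII owner for the two auxiliary streams: created once when
// the algorithm is constructed, destroyed exactly once, even on early returns.
class stream_pair {
    public:
    stream_pair() {
        cudaStreamCreateWithFlags(&m_fill, cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&m_scan, cudaStreamNonBlocking);
    }
    ~stream_pair() {
        cudaStreamDestroy(m_fill);
        cudaStreamDestroy(m_scan);
    }
    // Non-copyable: the raw handles must have a single owner.
    stream_pair(const stream_pair&) = delete;
    stream_pair& operator=(const stream_pair&) = delete;

    cudaStream_t fill() const { return m_fill; }
    cudaStream_t scan() const { return m_scan; }

    private:
    cudaStream_t m_fill{nullptr};
    cudaStream_t m_scan{nullptr};
};

Held as a member of the algorithm, this makes the lifetime question moot regardless of how cheap stream creation and destruction actually are.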


cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

kernels::reset_status<<<1, 1, 0, stream>>>(device::reset_status_payload{
@@ -503,6 +548,17 @@ greedy_ambiguity_resolution_algorithm::operator()(
.is_updated_view = is_updated_buffer,
.n_valid_threads = n_valid_threads_device.get()});

// Record the event after remove_tracks
cudaEvent_t event_removal;
cudaEventCreate(&event_removal);
cudaEventRecord(event_removal, stream);

// Make stream_fill (fill_inverted_ids) and stream_scan
// (block_inclusive_scan, scan_block_offsets, add_block_offset) wait for
// the remove_tracks kernel
cudaStreamWaitEvent(stream_fill, event_removal, 0);
cudaStreamWaitEvent(stream_scan, event_removal, 0);

// The seven kernels below are to keep sorted_ids sorted based on
// the relative shared measurements and pvalues. This can be reduced
// into thrust::sort():
@@ -529,7 +585,7 @@ greedy_ambiguity_resolution_algorithm::operator()(
});

 kernels::fill_inverted_ids<<<nBlocks_adaptive, nThreads_adaptive, 0,
-                             stream>>>(
+                             stream_fill>>>(
device::fill_inverted_ids_payload{
.sorted_ids_view = sorted_ids_buffer,
.terminate = terminate_device.get(),
@@ -538,33 +594,50 @@ greedy_ambiguity_resolution_algorithm::operator()(
.inverted_ids_view = inverted_ids_buffer,
});

-kernels::block_inclusive_scan<<<nBlocks_scan, nThreads_scan,
-                                nThreads_scan * sizeof(int), stream>>>(
-    device::block_inclusive_scan_payload{
-        .sorted_ids_view = sorted_ids_buffer,
-        .terminate = terminate_device.get(),
-        .n_accepted = n_accepted_device.get(),
-        .n_updated_tracks = n_updated_tracks_device.get(),
-        .is_updated_view = is_updated_buffer,
-        .block_offsets_view = block_offsets_buffer,
-        .prefix_sums_view = prefix_sums_buffer});
-
-kernels::scan_block_offsets<<<1, nBlocks_scan,
-                              nBlocks_scan * sizeof(int), stream>>>(
+kernels::
+    block_inclusive_scan<<<nBlocks_scan, nThreads_scan,
+                           nThreads_scan * sizeof(int), stream_scan>>>(
+        device::block_inclusive_scan_payload{
+            .sorted_ids_view = sorted_ids_buffer,
+            .terminate = terminate_device.get(),
+            .n_accepted = n_accepted_device.get(),
+            .n_updated_tracks = n_updated_tracks_device.get(),
+            .is_updated_view = is_updated_buffer,
+            .block_offsets_view = block_offsets_buffer,
+            .prefix_sums_view = prefix_sums_buffer});
+
+kernels::scan_block_offsets<<<
+    1, nBlocks_scan, nBlocks_scan * sizeof(int), stream_scan>>>(
     device::scan_block_offsets_payload{
         .terminate = terminate_device.get(),
         .n_accepted = n_accepted_device.get(),
         .n_updated_tracks = n_updated_tracks_device.get(),
         .block_offsets_view = block_offsets_buffer,
         .scanned_block_offsets_view = scanned_block_offsets_buffer});

-kernels::add_block_offset<<<nBlocks_scan, nThreads_scan, 0, stream>>>(
-    device::add_block_offset_payload{
-        .terminate = terminate_device.get(),
-        .n_accepted = n_accepted_device.get(),
-        .n_updated_tracks = n_updated_tracks_device.get(),
-        .block_offsets_view = scanned_block_offsets_buffer,
-        .prefix_sums_view = prefix_sums_buffer});
+kernels::
+    add_block_offset<<<nBlocks_scan, nThreads_scan, 0, stream_scan>>>(
+        device::add_block_offset_payload{
+            .terminate = terminate_device.get(),
+            .n_accepted = n_accepted_device.get(),
+            .n_updated_tracks = n_updated_tracks_device.get(),
+            .block_offsets_view = scanned_block_offsets_buffer,
+            .prefix_sums_view = prefix_sums_buffer});

// Record the events
cudaEvent_t event_main, event_fill, event_scan;
cudaEventCreate(&event_main);
cudaEventCreate(&event_fill);
cudaEventCreate(&event_scan);

cudaEventRecord(event_main, stream);
cudaEventRecord(event_fill, stream_fill);
cudaEventRecord(event_scan, stream_scan);

// Synchronize the events with the main stream
cudaStreamWaitEvent(stream, event_main, 0);
cudaStreamWaitEvent(stream, event_fill, 0);
cudaStreamWaitEvent(stream, event_scan, 0);
Member (on lines +644 to +656):

These are completely equivalent to just:

TRACCC_CUDA_CHECK(cudaStreamSynchronize(stream));
TRACCC_CUDA_CHECK(cudaStreamSynchronize(stream_fill));
TRACCC_CUDA_CHECK(cudaStreamSynchronize(stream_scan));

Creating events makes sense if you need to pass such events between independent code blocks. Here you just want to synchronize on the stream(s).

Contributor Author:

TRACCC_CUDA_ERROR_CHECK instead of TRACCC_CUDA_CHECK.

BTW, it is not obviously equivalent, as I see a crash like the following:

terminate called after throwing an instance of 'vecmem::cuda::runtime_error'
  what():  /mnt/nvme0n1/byeo/projects/traccc/traccc_build/_deps/vecmem-src/cuda/src/memory/managed_memory_resource.cpp:51 Failed to execute: cudaFree(p) (operation not permitted when stream is capturing)
Aborted (core dumped)
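
(The crash is consistent with stream-capture rules: while a stream is being captured, blocking or implicitly synchronizing calls such as cudaStreamSynchronize or cudaFree are not permitted, so cross-stream ordering inside the captured region has to be expressed with cudaEventRecord / cudaStreamWaitEvent, as this diff does. Below is a minimal, self-contained sketch of that fork-join pattern under capture; the kernels and names are placeholders, not traccc code, error checking is omitted, and the cudaGraphInstantiate call assumes the CUDA 12 signature.)

#include <cstdio>
#include <cuda_runtime.h>

__global__ void produce(int* x) { *x = 42; }   // stand-in for remove_tracks
__global__ void consume(int* x) { *x += 1; }   // stand-in for the main-stream work
__global__ void side_work(int* y) { *y = 7; }  // stand-in for the scan kernels

int main() {
    int *x = nullptr, *y = nullptr;
    cudaMalloc(&x, sizeof(int));  // allocate BEFORE capture: not allowed inside
    cudaMalloc(&y, sizeof(int));

    cudaStream_t main_s, side_s;
    cudaStreamCreateWithFlags(&main_s, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&side_s, cudaStreamNonBlocking);
    cudaEvent_t fork, join;
    cudaEventCreate(&fork);
    cudaEventCreate(&join);

    // Capture a two-stream fork-join. Calling cudaStreamSynchronize(main_s)
    // anywhere in here would fail and invalidate the capture, which is why
    // events, not stream synchronization, express the dependencies.
    cudaStreamBeginCapture(main_s, cudaStreamCaptureModeGlobal);
    produce<<<1, 1, 0, main_s>>>(x);
    cudaEventRecord(fork, main_s);          // fork point after the producer
    cudaStreamWaitEvent(side_s, fork, 0);   // side stream joins the capture
    side_work<<<1, 1, 0, side_s>>>(y);
    consume<<<1, 1, 0, main_s>>>(x);
    cudaEventRecord(join, side_s);          // join point: main waits on side
    cudaStreamWaitEvent(main_s, join, 0);
    cudaGraph_t graph;
    cudaStreamEndCapture(main_s, &graph);

    cudaGraphExec_t exec;
    cudaGraphInstantiate(&exec, graph, 0);  // CUDA 12 signature
    cudaGraphLaunch(exec, main_s);
    cudaStreamSynchronize(main_s);          // legal again: capture has ended

    int hx = 0, hy = 0;
    cudaMemcpy(&hx, x, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&hy, y, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("x = %d, y = %d\n", hx, hy);  // expected: x = 43, y = 7

    cudaGraphExecDestroy(exec);
    cudaGraphDestroy(graph);
    cudaEventDestroy(fork);
    cudaEventDestroy(join);
    cudaStreamDestroy(main_s);
    cudaStreamDestroy(side_s);
    cudaFree(x);
    cudaFree(y);
    return 0;
}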


kernels::rearrange_tracks<<<nBlocks_adaptive, nThreads_adaptive, 0,
stream>>>(device::rearrange_tracks_payload{