-
Notifications
You must be signed in to change notification settings - Fork 56
Use multiple streams to parallelize the kernels in the ambi-solver #1117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -458,10 +458,55 @@ greedy_ambiguity_resolution_algorithm::operator()( | |
| nThreads_scan = scan_dim.first; | ||
| nBlocks_scan = scan_dim.second; | ||
|
|
||
| /* | ||
| CUDA kernel sequence with multiple streams | ||
| │ | ||
| ├── reset_status | ||
| │ | ||
| ├── find_max_shared | ||
| │ | ||
| ├── remove_tracks | ||
| │ │ | ||
| │ └── [record event_removal] | ||
| │ | ||
| ├── sort_updated_tracks (Main stream — executed after event_removal) | ||
| │ │ | ||
| │ └── [record event_main] | ||
| │ | ||
| ├───▶ stream_fill | ||
| │ │ | ||
| │ └── [wait for event_removal] | ||
| │ │ | ||
| │ └── fill_inverted_ids | ||
| │ │ | ||
| │ └── [record event_fill] | ||
| │ | ||
| ├───▶ stream_scan | ||
| │ │ | ||
| │ └── [wait for event_removal] | ||
| │ │ | ||
| │ ├── block_inclusive_scan | ||
| │ ├── scan_block_offsets | ||
| │ └── add_block_offset | ||
| │ │ | ||
| │ └── [record event_scan] | ||
| │ | ||
| ├── [wait for event_main, event_fill, and event_scan] ← sync point | ||
| │ | ||
| ├── rearrange_tracks | ||
| │ | ||
| └── gather_tracks | ||
| */ | ||
|
|
||
| // Make CUDA Graph | ||
| cudaGraph_t graph; | ||
| cudaGraphExec_t graphExec; | ||
|
|
||
| // Create stream for parallelizable kernels | ||
| cudaStream_t stream_fill, stream_scan; | ||
| cudaStreamCreateWithFlags(&stream_fill, cudaStreamNonBlocking); | ||
| cudaStreamCreateWithFlags(&stream_scan, cudaStreamNonBlocking); | ||
|
|
||
| cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); | ||
|
|
||
| kernels::reset_status<<<1, 1, 0, stream>>>(device::reset_status_payload{ | ||
|
|
@@ -503,6 +548,17 @@ greedy_ambiguity_resolution_algorithm::operator()( | |
| .is_updated_view = is_updated_buffer, | ||
| .n_valid_threads = n_valid_threads_device.get()}); | ||
|
|
||
| // Record the event after remove_tracks | ||
| cudaEvent_t event_removal; | ||
| cudaEventCreate(&event_removal); | ||
| cudaEventRecord(event_removal, stream); | ||
|
|
||
| // Make stream_fill (fill_inverted_ids) and stream_scan | ||
| // (block_inclusive_scan, scan_block_offsets, add_block_offset) wait for | ||
| // the remove_tracks | ||
| cudaStreamWaitEvent(stream_fill, event_removal, 0); | ||
| cudaStreamWaitEvent(stream_scan, event_removal, 0); | ||
|
|
||
| // The seven kernels below are to keep sorted_ids sorted based on | ||
| // the relative shared measurements and pvalues. This can be reduced | ||
| // into thrust::sort(): | ||
|
|
@@ -529,7 +585,7 @@ greedy_ambiguity_resolution_algorithm::operator()( | |
| }); | ||
|
|
||
| kernels::fill_inverted_ids<<<nBlocks_adaptive, nThreads_adaptive, 0, | ||
| stream>>>( | ||
| stream_fill>>>( | ||
| device::fill_inverted_ids_payload{ | ||
| .sorted_ids_view = sorted_ids_buffer, | ||
| .terminate = terminate_device.get(), | ||
|
|
@@ -538,33 +594,50 @@ greedy_ambiguity_resolution_algorithm::operator()( | |
| .inverted_ids_view = inverted_ids_buffer, | ||
| }); | ||
|
|
||
| kernels::block_inclusive_scan<<<nBlocks_scan, nThreads_scan, | ||
| nThreads_scan * sizeof(int), stream>>>( | ||
| device::block_inclusive_scan_payload{ | ||
| .sorted_ids_view = sorted_ids_buffer, | ||
| .terminate = terminate_device.get(), | ||
| .n_accepted = n_accepted_device.get(), | ||
| .n_updated_tracks = n_updated_tracks_device.get(), | ||
| .is_updated_view = is_updated_buffer, | ||
| .block_offsets_view = block_offsets_buffer, | ||
| .prefix_sums_view = prefix_sums_buffer}); | ||
|
|
||
| kernels::scan_block_offsets<<<1, nBlocks_scan, | ||
| nBlocks_scan * sizeof(int), stream>>>( | ||
| kernels:: | ||
| block_inclusive_scan<<<nBlocks_scan, nThreads_scan, | ||
| nThreads_scan * sizeof(int), stream_scan>>>( | ||
| device::block_inclusive_scan_payload{ | ||
| .sorted_ids_view = sorted_ids_buffer, | ||
| .terminate = terminate_device.get(), | ||
| .n_accepted = n_accepted_device.get(), | ||
| .n_updated_tracks = n_updated_tracks_device.get(), | ||
| .is_updated_view = is_updated_buffer, | ||
| .block_offsets_view = block_offsets_buffer, | ||
| .prefix_sums_view = prefix_sums_buffer}); | ||
|
|
||
| kernels::scan_block_offsets<<< | ||
| 1, nBlocks_scan, nBlocks_scan * sizeof(int), stream_scan>>>( | ||
| device::scan_block_offsets_payload{ | ||
| .terminate = terminate_device.get(), | ||
| .n_accepted = n_accepted_device.get(), | ||
| .n_updated_tracks = n_updated_tracks_device.get(), | ||
| .block_offsets_view = block_offsets_buffer, | ||
| .scanned_block_offsets_view = scanned_block_offsets_buffer}); | ||
|
|
||
| kernels::add_block_offset<<<nBlocks_scan, nThreads_scan, 0, stream>>>( | ||
| device::add_block_offset_payload{ | ||
| .terminate = terminate_device.get(), | ||
| .n_accepted = n_accepted_device.get(), | ||
| .n_updated_tracks = n_updated_tracks_device.get(), | ||
| .block_offsets_view = scanned_block_offsets_buffer, | ||
| .prefix_sums_view = prefix_sums_buffer}); | ||
| kernels:: | ||
| add_block_offset<<<nBlocks_scan, nThreads_scan, 0, stream_scan>>>( | ||
| device::add_block_offset_payload{ | ||
| .terminate = terminate_device.get(), | ||
| .n_accepted = n_accepted_device.get(), | ||
| .n_updated_tracks = n_updated_tracks_device.get(), | ||
| .block_offsets_view = scanned_block_offsets_buffer, | ||
| .prefix_sums_view = prefix_sums_buffer}); | ||
|
|
||
| // Record the events | ||
| cudaEvent_t event_main, event_fill, event_scan; | ||
| cudaEventCreate(&event_main); | ||
| cudaEventCreate(&event_fill); | ||
| cudaEventCreate(&event_scan); | ||
|
|
||
| cudaEventRecord(event_main, stream); | ||
| cudaEventRecord(event_fill, stream_fill); | ||
| cudaEventRecord(event_scan, stream_scan); | ||
|
|
||
| // Synchronize the events with main stream | ||
| cudaStreamWaitEvent(stream, event_main, 0); | ||
| cudaStreamWaitEvent(stream, event_fill, 0); | ||
| cudaStreamWaitEvent(stream, event_scan, 0); | ||
|
Comment on lines
+644
to
+656
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are completely equivalent to just:
TRACCC_CUDA_CHECK(cudaStreamSynchronize(stream));
TRACCC_CUDA_CHECK(cudaStreamSynchronize(stream_fill));
TRACCC_CUDA_CHECK(cudaStreamSynchronize(stream_scan));
Creating events makes sense if you need to pass such events between independent code blocks. Here you just want to synchronize on the stream(s).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
BTW, it is not obviously equivalent as I see a crash like the following: |
||
|
|
||
| kernels::rearrange_tracks<<<nBlocks_adaptive, nThreads_adaptive, 0, | ||
| stream>>>(device::rearrange_tracks_payload{ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not only do you create streams inside of the execute function of the algorithm (as opposed to its constructor), but you do this inside of the while-loop of the algorithm.
Plus I don't see any
cudaStreamDestroy(...)statements.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch. I added cudaStreamDestroy.
I took stream_fill and stream_scan out of the loop so they are created and destroyed only once per event.
I don't know why you think creating and destroying the streams are expensive — do you have any reliable reference on this? I believe it is on a microsecond scale, and the usual processing time per event is already a few hundred milliseconds. Unless you really need to save the <<0.1% of the performance, we might need to change this PR, but I don't think it is worth that much 🤔