44//! thread blocks and execute in SIMT fashion.
55
66use crate :: gpu_only;
7+ #[ cfg( target_os = "cuda" ) ]
78use core:: arch:: asm;
89use half:: { bf16, f16} ;
910
@@ -329,7 +330,7 @@ unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) {
329330/// Behavior is undefined if:
330331/// - Any thread participating in the vote has exited or the executing thread is not in `mask`.
331332/// - For `compute_62` and below, all threads in `mask` must call this function in convergence, and only threads belonging
332- /// to the `mask` can be active when the intrinsic is called.
333+ /// to the `mask` can be active when the intrinsic is called.
333334/// - A thread tries to execute this function while not being present in `mask`.
334335#[ gpu_only]
335336pub unsafe fn warp_vote_all ( mask : u32 , predicate : bool ) -> bool {
@@ -359,7 +360,7 @@ pub unsafe fn warp_vote_all(mask: u32, predicate: bool) -> bool {
359360/// Behavior is undefined if:
360361/// - Any thread participating in the vote has exited or the executing thread is not in `mask`.
361362/// - For `compute_62` and below, all threads in `mask` must call this function in convergence, and only threads belonging
362- /// to the `mask` can be active when the intrinsic is called.
363+ /// to the `mask` can be active when the intrinsic is called.
363364/// - A thread tries to execute this function while not being present in `mask`.
364365#[ gpu_only]
365366pub unsafe fn warp_vote_any ( mask : u32 , predicate : bool ) -> bool {
@@ -389,7 +390,7 @@ pub unsafe fn warp_vote_any(mask: u32, predicate: bool) -> bool {
389390/// Behavior is undefined if:
390391/// - Any thread participating in the vote has exited or the executing thread is not in `mask`.
391392/// - For `compute_62` and below, all threads in `mask` must call this function in convergence, and only threads belonging
392- /// to the `mask` can be active when the intrinsic is called.
393+ /// to the `mask` can be active when the intrinsic is called.
393394/// - A thread tries to execute this function while not being present in `mask`.
394395#[ gpu_only]
395396pub unsafe fn warp_vote_ballot ( mask : u32 , predicate : bool ) -> u32 {
@@ -415,10 +416,10 @@ pub unsafe fn warp_vote_ballot(mask: u32, predicate: bool) -> u32 {
415416///
416417/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
417418/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
418- /// that calculates this thread as its target lane.
419+ /// that calculates this thread as its target lane.
419420/// - `delta` is the value that will be subtracted from the current thread's lane to calculate the target lane.
420421/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
421- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
422+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
422423///
423424/// # Returns
424425///
@@ -439,7 +440,7 @@ pub unsafe fn warp_vote_ballot(mask: u32, predicate: bool) -> u32 {
439440/// Behavior is undefined if:
440441/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
441442/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
442- /// in `mask` can be active when the shuffle is called.
443+ /// in `mask` can be active when the shuffle is called.
443444///
444445/// The returned value is unspecified if the calculated target lane is inactive.
445446pub unsafe fn warp_shuffle_down < T : WarpShuffleValue > (
@@ -457,10 +458,10 @@ pub unsafe fn warp_shuffle_down<T: WarpShuffleValue>(
457458///
458459/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
459460/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
460- /// that calculates this thread as its target lane.
461+ /// that calculates this thread as its target lane.
461462/// - `delta` is the value that will be added to the current thread's lane to calculate the target lane.
462463/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
463- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
464+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
464465///
465466/// # Returns
466467///
@@ -481,7 +482,7 @@ pub unsafe fn warp_shuffle_down<T: WarpShuffleValue>(
481482/// Behavior is undefined if:
482483/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
483484/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
484- /// in `mask` can be active when the shuffle is called.
485+ /// in `mask` can be active when the shuffle is called.
485486///
486487/// The returned value is unspecified if the calculated target lane is inactive.
487488pub unsafe fn warp_shuffle_up < T : WarpShuffleValue > (
@@ -499,10 +500,10 @@ pub unsafe fn warp_shuffle_up<T: WarpShuffleValue>(
499500///
500501/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
501502/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
502- /// that calculates this thread as its target lane.
503+ /// that calculates this thread as its target lane.
503504/// - `idx` is the target lane that will be used as the source of this thread's returned value.
504505/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
505- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
506+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
506507///
507508/// # Returns
508509///
@@ -523,7 +524,7 @@ pub unsafe fn warp_shuffle_up<T: WarpShuffleValue>(
523524/// Behavior is undefined if:
524525/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
525526/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
526- /// in `mask` can be active when the shuffle is called.
527+ /// in `mask` can be active when the shuffle is called.
527528///
528529/// The returned value is unspecified if the calculated target lane is inactive.
529530pub unsafe fn warp_shuffle_idx < T : WarpShuffleValue > (
@@ -541,11 +542,11 @@ pub unsafe fn warp_shuffle_idx<T: WarpShuffleValue>(
541542///
542543/// - `mask` dictates what threads will participate in the shuffle, usually [`u32::MAX`] to indicate all threads.
543544/// - `value` is the value that will be shuffled across the threads. i.e. the value that will be given to the thread
544- /// that calculates this thread as its target lane.
545+ /// that calculates this thread as its target lane.
545546/// - `lane_mask` is the value that will be XOR'd by the current thread's lane id to calculate the target lane. i.e. the
546- /// target lane will be `lane_id ^ lane_mask`.
547+ /// target lane will be `lane_id ^ lane_mask`.
547548/// - `width` dictates how to optionally split the warp into subsections; it must be a power of two no greater than `32`.
548- /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
549+ /// calculated source lane values will NOT wrap around the value of `width`. Usually just `32`.
549550///
550551/// # Returns
551552///
@@ -566,7 +567,7 @@ pub unsafe fn warp_shuffle_idx<T: WarpShuffleValue>(
566567/// Behavior is undefined if:
567568/// - Any thread participating in the shuffle has exited or the executing thread is not in `mask`.
568569/// - For `compute_62` and below, all threads in `mask` must call the same function in convergence, and only the threads
569- /// in `mask` can be active when the shuffle is called.
570+ /// in `mask` can be active when the shuffle is called.
570571///
571572/// The returned value is unspecified if the calculated target lane is inactive.
572573pub unsafe fn warp_shuffle_xor < T : WarpShuffleValue > (
0 commit comments