Skip to content

Commit 578d64b

Browse files
Fix tiny problem sizes for warpspeed scan (#7921)
1 parent ed1e146 commit 578d64b

File tree

3 files changed

+8
-2
lines changed

3 files changed

+8
-2
lines changed

cub/cub/detail/warpspeed/squad/load_store.cuh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -236,7 +236,7 @@ squadStoreBulkSync(Squad squad, CpAsyncOobInfo<OutputT> cpAsyncOobInfo, const ::
236236

237237
constexpr ::cuda::std::uint16_t byteMask = 0xFFFF;
238238
const ::cuda::std::uint16_t byteMaskStart = byteMask << cpAsyncOobInfo.smemStartSkipBytes;
239-
const ::cuda::std::uint16_t byteMaskEnd = byteMask >> (16 - cpAsyncOobInfo.smemEndBytesAfter16BBoundary);
239+
const ::cuda::std::uint16_t byteMaskEnd = byteMask >> (16 - cpAsyncOobInfo.smemEndBytesAfter16BBoundary) % 16;
240240
// byteMaskStart contains zeroes at the left
241241
# if _CCCL_CUDA_COMPILER(NVCC, >=, 13, 2)
242242
const ::cuda::std::uint16_t byteMaskSmall = byteMaskStart & byteMaskEnd;

cub/cub/device/dispatch/dispatch_scan.cuh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,12 @@ struct DispatchScan
410410
template <typename ActivePolicyT>
411411
CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t __invoke_lookahead_algorithm(ActivePolicyT)
412412
{
413+
if (num_items == 0)
414+
{
415+
temp_storage_bytes = 1; // just fulfill the contract that CUB always requires some temporary storage
416+
return cudaSuccess;
417+
}
418+
413419
using InputT = ::cuda::std::iter_value_t<InputIteratorT>;
414420
using OutputT = ::cuda::std::iter_value_t<OutputIteratorT>;
415421
using WarpspeedPolicy = typename ActivePolicyT::WarpspeedPolicy;

cub/test/catch2_test_device_scan_alignment.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", value
3939
constexpr offset_t max_num_items = 8192;
4040

4141
const auto offset = GENERATE_COPY(values({0, 1, 3, 4, 7, 8, 11, 12, 16}), take(3, random(0, max_offset)));
42-
const auto num_items = GENERATE_COPY(values({1, max_num_items}), take(64, random(0, max_num_items)));
42+
const auto num_items = GENERATE_COPY(values({0, 1, max_num_items}), take(64, random(2, max_num_items - 1)));
4343

4444
CAPTURE(num_items, offset);
4545

0 commit comments

Comments
 (0)