Skip to content

Commit fc5c63c

Browse files
author
Yihan Wang
authored
[SYCLomatic] Support migration of cub::{LoadDirectBlocked, LoadDirectStriped} (#2094)
Signed-off-by: Wang, Yihan <[email protected]>
1 parent cd0f835 commit fc5c63c

File tree

5 files changed

+76
-4
lines changed

5 files changed

+76
-4
lines changed

clang/lib/DPCT/APINames_CUB.inc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,11 +228,11 @@ ENTRY_MEMBER_FUNCTION(cub::GridEvenShare, cub::GridEvenShare, BlockInit, BlockIn
228228
// Thread and thread block I/O
229229
ENTRY(cub::ThreadLoad, cub::ThreadLoad, true, NO_FLAG, P4, "Successful")
230230
ENTRY(cub::ThreadStore, cub::ThreadStore, true, NO_FLAG, P4, "Successful")
231-
ENTRY(cub::LoadDirectBlocked, cub::LoadDirectBlocked, false, NO_FLAG, P4, "Comment")
231+
ENTRY(cub::LoadDirectBlocked, cub::LoadDirectBlocked, true, NO_FLAG, P4, "Successful")
232232
ENTRY(cub::LoadDirectBlockedVectorized, cub::LoadDirectBlockedVectorized, false, NO_FLAG, P4, "Comment")
233233
ENTRY(cub::StoreDirectBlocked, cub::StoreDirectBlocked, false, NO_FLAG, P4, "Comment")
234234
ENTRY(cub::StoreDirectBlockedVectorized, cub::StoreDirectBlockedVectorized, false, NO_FLAG, P4, "Comment")
235-
ENTRY(cub::LoadDirectStriped, cub::LoadDirectStriped, false, NO_FLAG, P4, "Comment")
235+
ENTRY(cub::LoadDirectStriped, cub::LoadDirectStriped, true, NO_FLAG, P4, "Successful")
236236
ENTRY(cub::StoreDirectStriped, cub::StoreDirectStriped, false, NO_FLAG, P4, "Comment")
237237
ENTRY(cub::LoadDirectWarpStriped, cub::LoadDirectWarpStriped, false, NO_FLAG, P4, "Comment")
238238
ENTRY(cub::StoreDirectWarpStriped, cub::StoreDirectWarpStriped, false, NO_FLAG, P4, "Comment")

clang/lib/DPCT/CUBAPIMigration.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,8 @@ void CubIntrinsicRule::registerMatcher(ast_matchers::MatchFinder &MF) {
192192
"WarpId", "SyncStream", "CurrentDevice", "DeviceCount",
193193
"DeviceCountUncached", "DeviceCountCachedValue",
194194
"PtxVersion", "PtxVersionUncached", "SmVersion",
195-
"SmVersionUncached", "RowMajorTid"),
195+
"SmVersionUncached", "RowMajorTid",
196+
"LoadDirectBlocked", "LoadDirectStriped"),
196197
hasAncestor(namespaceDecl(hasName("cub")))))))
197198
.bind("IntrinsicCall"),
198199
this);

clang/lib/DPCT/Rewriters/CUB/RewriterUtilityFunctions.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,5 +114,19 @@ RewriterMap dpct::createUtilityFunctionsRewriterMap() {
114114
LITERAL("10"))))
115115
// cub::RowMajorTid
116116
MEMBER_CALL_FACTORY_ENTRY("cub::RowMajorTid", NDITEM, /*IsArrow=*/false,
117-
"get_local_linear_id")};
117+
"get_local_linear_id")
118+
// cub::LoadDirectBlocked
119+
HEADER_INSERT_FACTORY(
120+
HeaderType::HT_DPCT_GROUP_Utils,
121+
CALL_FACTORY_ENTRY(
122+
"cub::LoadDirectBlocked",
123+
CALL(MapNames::getDpctNamespace() + "group::load_direct_blocked",
124+
NDITEM, ARG(1), ARG(2))))
125+
// cub::LoadDirectStriped
126+
HEADER_INSERT_FACTORY(
127+
HeaderType::HT_DPCT_GROUP_Utils,
128+
CALL_FACTORY_ENTRY(
129+
"cub::LoadDirectStriped",
130+
CALL(MapNames::getDpctNamespace() + "group::load_direct_striped",
131+
NDITEM, ARG(1), ARG(2))))};
118132
}

clang/runtime/dpct-rt/include/dpct/group_utils.hpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,41 @@ __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr,
476476
}
477477
}
478478

479+
// loads a linear segment of workgroup items into a blocked arrangement.
480+
template <typename InputT, size_t ITEMS_PER_WORK_ITEM, typename InputIteratorT,
481+
typename Item>
482+
__dpct_inline__ void load_direct_blocked(const Item &item, InputIteratorT block_itr,
483+
InputT (&items)[ITEMS_PER_WORK_ITEM]) {
484+
485+
// This implementation does not take in account range loading across
486+
// workgroup items To-do: Decide whether range loading is required for group
487+
// loading
488+
size_t linear_tid = item.get_local_linear_id();
489+
uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM;
490+
#pragma unroll
491+
for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
492+
items[idx] = block_itr[workgroup_offset + idx];
493+
}
494+
}
495+
496+
// loads a linear segment of workgroup items into a striped arrangement.
497+
template <typename InputT, size_t ITEMS_PER_WORK_ITEM, typename InputIteratorT,
498+
typename Item>
499+
__dpct_inline__ void load_direct_striped(const Item &item, InputIteratorT block_itr,
500+
InputT (&items)[ITEMS_PER_WORK_ITEM]) {
501+
502+
// This implementation does not take in account range loading across
503+
// workgroup items To-do: Decide whether range loading is required for group
504+
// loading
505+
size_t linear_tid = item.get_local_linear_id();
506+
size_t group_work_items = item.get_local_range().size();
507+
#pragma unroll
508+
for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) {
509+
items[idx] = block_itr[linear_tid + (idx * group_work_items)];
510+
}
511+
}
512+
513+
479514
// loads a linear segment of workgroup items into a subgroup striped
480515
// arrangement. Created as free function until exchange mechanism is
481516
// implemented.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2
3+
// RUN: dpct -in-root %S -out-root %T/intrinsic/load %S/load.cu --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only
4+
// RUN: FileCheck --input-file %T/intrinsic/load/load.dp.cpp --match-full-lines %s
5+
// RUN: %if build_lit %{icpx -c -fsycl %T/intrinsic/load/load.dp.cpp -o %T/intrinsic/load/load.dp.o %}
6+
7+
// CHECK:#include <sycl/sycl.hpp>
8+
// CHECK:#include <dpct/dpct.hpp>
9+
// CHECK:#include <dpct/group_utils.hpp>
10+
#include <cub/cub.cuh>
11+
12+
__global__ void TestLoadStriped(int *d_data) {
13+
int thread_data[4];
14+
// CHECK: dpct::group::load_direct_striped(item_ct1, d_data, thread_data);
15+
cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
16+
}
17+
18+
__global__ void BlockedToStripedKernel(int *d_data) {
19+
int thread_data[4];
20+
// CHECK: dpct::group::load_direct_blocked(item_ct1, d_data, thread_data);
21+
cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
22+
}

0 commit comments

Comments
 (0)