-
Notifications
You must be signed in to change notification settings - Fork 96
[SYCLomatic] Block Load headers core #1640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 68 commits
bde928e
8717079
1549642
41d994e
99df7d9
789bd18
7872841
ec7a718
fe4c38e
a4e2316
1a0b447
c5e4fad
95edd0e
1419253
6f99026
eb5539a
8ce2b68
871c6c2
e8fc26e
0f7b5e4
93db62a
2d78e9a
1276698
8d43351
3d22cd7
0b32a44
e24ebb6
293bf14
721e722
a164256
0dc3fa0
59b881e
b6f123c
c0d96f5
5436755
118bcc1
7060821
aa6268a
48677b9
2e66d3c
70d5d27
7e50327
73aab25
f97e665
b90f7d9
fc0ce87
c788856
d4ce0b1
29d4405
a6a85ff
6ffd681
ee45991
7c8111d
a406b15
c4e125c
c3bc942
69dbddc
49f5d85
b1d8d70
7cefbf8
fdc2f2f
95db67f
141ace7
540db29
ebf6237
7f9d4e6
cb87b67
71d4047
bd24713
26a3ae2
6222566
236c7ba
89cf7d3
c4f0ca0
ec4f8ae
d382f60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -566,6 +566,106 @@ class radix_sort { | |
uint8_t *_local_memory; | ||
}; | ||
|
||
/// Load linear segment items into block format across threads | ||
/// Helper for Block Load | ||
enum load_algorithm { | ||
|
||
BLOCK_LOAD_DIRECT, | ||
BLOCK_LOAD_STRIPED, | ||
// To-do: BLOCK_LOAD_WARP_TRANSPOSE | ||
|
||
}; | ||
|
||
// loads a linear segment of workgroup items into a blocked arrangement. | ||
template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT, | ||
typename Item> | ||
__dpct_inline__ void load_blocked(const Item &item, InputIteratorT block_itr, | ||
InputT (&items)[ITEMS_PER_WORK_ITEM]) { | ||
|
||
// This implementation does not take in account range loading across | ||
// workgroup items To-do: Decide whether range loading is required for group | ||
// loading | ||
size_t linear_tid = item.get_local_linear_id(); | ||
uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; | ||
#pragma unroll | ||
for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { | ||
items[idx] = block_itr[workgroup_offset + idx]; | ||
} | ||
} | ||
|
||
// loads a linear segment of workgroup items into a striped arrangement. | ||
template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT, | ||
typename Item> | ||
__dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, | ||
InputT (&items)[ITEMS_PER_WORK_ITEM]) { | ||
|
||
// This implementation does not take in account range loading across | ||
// workgroup items To-do: Decide whether range loading is required for group | ||
// loading | ||
size_t linear_tid = item.get_local_linear_id(); | ||
size_t group_work_items = item.get_local_range().size(); | ||
#pragma unroll | ||
for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { | ||
items[idx] = block_itr[linear_tid + (idx * group_work_items)]; | ||
} | ||
} | ||
|
||
// loads a linear segment of workgroup items into a subgroup striped | ||
// arrangement. Created as free function until exchange mechanism is | ||
// implemented. | ||
// To-do: inline this function with BLOCK_LOAD_WARP_TRANSPOSE mechanism | ||
template <size_t ITEMS_PER_WORK_ITEM, typename InputT, typename InputIteratorT, | ||
typename Item> | ||
__dpct_inline__ void | ||
uninitialized_load_subgroup_striped(const Item &item, InputIteratorT block_itr, | ||
InputT (&items)[ITEMS_PER_WORK_ITEM]) { | ||
|
||
// This implementation does not take in account range loading across | ||
// workgroup items To-do: Decide whether range loading is required for group | ||
// loading | ||
// This implementation uses unintialized memory for loading linear segments | ||
// into warp striped arrangement. | ||
uint32_t subgroup_offset = item.get_sub_group().get_local_linear_id(); | ||
uint32_t subgroup_size = item.get_sub_group().get_local_linear_range(); | ||
uint32_t subgroup_idx = item.get_sub_group().get_group_linear_id(); | ||
uint32_t initial_offset = | ||
(subgroup_idx * ITEMS_PER_WORK_ITEM * subgroup_size) + subgroup_offset; | ||
#pragma unroll | ||
for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { | ||
new (&items[idx]) InputT(block_itr[initial_offset + (idx * subgroup_size)]); | ||
} | ||
} | ||
// template parameters : | ||
// ITEMS_PER_WORK_ITEM: size_t varaiable controlling the number of items per | ||
abhilash1910 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// thread/work_item | ||
// ALGORITHM: load_algorithm variable controlling the type of load operation. | ||
// InputT: typename parameter controlled at runtime from input sequence. | ||
abhilash1910 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// InputIteratorT: typename parameter for iterator pointer controlled at | ||
abhilash1910 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// runtime. | ||
// Item : typename parameter resembling sycl::nd_item<3> . | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we rename to nd_item? (what is the diff b/w Item and sycl::nd_item) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it is same thing, but to maintain the same variable name across the file (similar for other apis), Item is used. |
||
template <size_t ITEMS_PER_WORK_ITEM, load_algorithm ALGORITHM, typename InputT, | ||
typename InputIteratorT, typename Item> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here are quite many item usage, ITEM/Item/iterms/ITERMS_PER_WORK_ITEM, maybe it should give a better name for "typename Item", it is a little confusing. or please add comments. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks will add comments to clarify. |
||
class workgroup_load { | ||
public: | ||
static size_t get_local_memory_size(size_t group_work_items) { return 0; } | ||
workgroup_load(uint8_t *local_memory) : _local_memory(local_memory) {} | ||
|
||
__dpct_inline__ void load(const Item &item, InputIteratorT block_itr, | ||
InputT (&items)[ITEMS_PER_WORK_ITEM]) { | ||
|
||
if constexpr (ALGORITHM == BLOCK_LOAD_DIRECT) { | ||
load_blocked<ITEMS_PER_WORK_ITEM>(item, block_itr, | ||
(&items)[ITEMS_PER_WORK_ITEM]); | ||
} else if constexpr (ALGORITHM == BLOCK_LOAD_STRIPED) { | ||
load_striped<ITEMS_PER_WORK_ITEM>(item, block_itr, | ||
(&items)[ITEMS_PER_WORK_ITEM]); | ||
} | ||
} | ||
|
||
private: | ||
uint8_t *_local_memory; | ||
danhoeflinger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}; | ||
|
||
/// Perform a reduction of the data elements assigned to all threads in the | ||
/// group. | ||
/// | ||
|
Uh oh!
There was an error while loading. Please reload this page.