|
9 | 9 | #ifndef __DPCT_GROUP_UTILS_HPP__ |
10 | 10 | #define __DPCT_GROUP_UTILS_HPP__ |
11 | 11 |
|
| 12 | +#include <iterator> |
12 | 13 | #include <stdexcept> |
13 | 14 | #include <sycl/sycl.hpp> |
14 | 15 |
|
@@ -476,41 +477,59 @@ __dpct_inline__ void load_striped(const Item &item, InputIteratorT block_itr, |
476 | 477 | } |
477 | 478 | } |
478 | 479 |
|
479 | | -// loads a linear segment of workgroup items into a blocked arrangement. |
480 | | -template <typename InputT, size_t ITEMS_PER_WORK_ITEM, typename InputIteratorT, |
481 | | - typename Item> |
482 | | -__dpct_inline__ void load_direct_blocked(const Item &item, InputIteratorT block_itr, |
483 | | - InputT (&items)[ITEMS_PER_WORK_ITEM]) { |
484 | | - |
485 | | - // This implementation does not take in account range loading across |
486 | | - // workgroup items To-do: Decide whether range loading is required for group |
487 | | - // loading |
488 | | - size_t linear_tid = item.get_local_linear_id(); |
489 | | - uint32_t workgroup_offset = linear_tid * ITEMS_PER_WORK_ITEM; |
| 480 | +/// Load a linear segment of elements into a blocked arrangement across the |
| 481 | +/// work-group. |
| 482 | +/// |
| 483 | +/// \tparam InputT The data type to load. |
| 484 | +/// |
| 485 | +/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned |
| 486 | +/// onto each work-item. |
| 487 | +/// |
| 488 | +/// \tparam InputIteratorT The random-access iterator type of \p block_itr. |
| 489 | +/// |
| 490 | +/// \param linear_tid A suitable linear identifier for the calling work-item. |
| 491 | +/// |
| 492 | +/// \param block_itr The work-group's base input iterator for loading from. |
| 493 | +/// |
| 494 | +/// \param items Output array that receives the loaded elements. |
| 495 | +template <typename InputT, size_t ElementsPerWorkItem, typename InputIteratorT> |
| 496 | +__dpct_inline__ void load_direct_blocked(size_t linear_tid, |
| 497 | + InputIteratorT block_itr, |
| 498 | + InputT (&items)[ElementsPerWorkItem]) { |
490 | 499 | #pragma unroll |
491 | | - for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { |
492 | | - items[idx] = block_itr[workgroup_offset + idx]; |
| 500 | + for (size_t i = 0; i < ElementsPerWorkItem; i++) { |
| 501 | + items[i] = block_itr[(linear_tid * ElementsPerWorkItem) + i]; |
493 | 502 | } |
494 | 503 | } |
495 | 504 |
|
496 | | -// loads a linear segment of workgroup items into a striped arrangement. |
497 | | -template <typename InputT, size_t ITEMS_PER_WORK_ITEM, typename InputIteratorT, |
498 | | - typename Item> |
499 | | -__dpct_inline__ void load_direct_striped(const Item &item, InputIteratorT block_itr, |
500 | | - InputT (&items)[ITEMS_PER_WORK_ITEM]) { |
501 | | - |
502 | | - // This implementation does not take in account range loading across |
503 | | - // workgroup items To-do: Decide whether range loading is required for group |
504 | | - // loading |
505 | | - size_t linear_tid = item.get_local_linear_id(); |
506 | | - size_t group_work_items = item.get_local_range().size(); |
| 505 | +/// Load a linear segment of elements into a striped arrangement across the |
| 506 | +/// work-group. |
| 507 | +/// |
| 508 | +/// \tparam WorkGroupSize The work-group size. |
| 509 | +/// |
| 510 | +/// \tparam InputT The data type to load. |
| 511 | +/// |
| 512 | +/// \tparam ElementsPerWorkItem The number of elements loaded by each |
| 513 | +/// work-item, read at a stride of WorkGroupSize. |
| 514 | +/// |
| 515 | +/// \tparam InputIteratorT The random-access iterator type of \p block_itr. |
| 516 | +/// |
| 517 | +/// \param linear_tid A suitable linear identifier for the calling work-item. |
| 518 | +/// |
| 519 | +/// \param block_itr The work-group's base input iterator for loading from. |
| 520 | +/// |
| 521 | +/// \param items Output array that receives the loaded elements. |
| 522 | +template <size_t WorkGroupSize, typename InputT, int ElementsPerWorkItem, |
| 523 | + typename InputIteratorT> |
| 524 | +__dpct_inline__ void load_direct_striped(size_t linear_tid, |
| 525 | + InputIteratorT block_itr, |
| 526 | + InputT (&items)[ElementsPerWorkItem]) { |
507 | 527 | #pragma unroll |
508 | | - for (size_t idx = 0; idx < ITEMS_PER_WORK_ITEM; idx++) { |
509 | | - items[idx] = block_itr[linear_tid + (idx * group_work_items)]; |
| 528 | + for (size_t i = 0; i < ElementsPerWorkItem; i++) { |
| 529 | + items[i] = block_itr[linear_tid + i * WorkGroupSize]; |
510 | 530 | } |
511 | 531 | } |
512 | 532 |
|
513 | | - |
514 | 533 | // loads a linear segment of workgroup items into a subgroup striped |
515 | 534 | // arrangement. Created as free function until exchange mechanism is |
516 | 535 | // implemented. |
|
0 commit comments