Skip to content

Commit 563e4a4

Browse files
pkwasnie-intel authored and igcbot committed
new intrinsic: subgroup clustered sorted ordinal
Adds a new intrinsic: subgroup clustered sorted ordinal. For each lane it returns the lane index within the cluster to which that lane's value would be moved if the cluster were sorted. Supports clusters of 8/16/32 lanes. Example: for the SIMD16 input 0, 2, 19, 4, 1, 5, 7, 9, 19, 7, 18, 4, 10, 5, 2, 3, descending order with cluster size = 8 produces 7, 5, 0, 4, 6, 3, 2, 1, 0, 3, 1, 5, 2, 4, 7, 6. The implementation is based on the subgroup sort in group_sort.cl.
1 parent b180c4a commit 563e4a4

File tree

5 files changed

+191
-3
lines changed

5 files changed

+191
-3
lines changed

IGC/AdaptorOCL/ocl_igc_interface/igc_builtins.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ CIF_DEFINE_INTERFACE_VER(IgcBuiltins, 1) {
3131

3232
static constexpr auto sort = AlgorithmCoder::Enc("SORT");
3333
static constexpr auto clusteredSort = AlgorithmCoder::Enc("CLUSTER_SORT");
34+
static constexpr auto clusteredSortedOrdinal = AlgorithmCoder::Enc("CSORTED_ORD");
3435
} BuiltinAlgorithm;
3536

3637
typedef struct BuiltinMemoryType {

IGC/AdaptorOCL/ocl_igc_interface/impl/igc_builtins_impl.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,12 @@ bool CIF_GET_INTERFACE_CLASS(IgcBuiltins, 1)::GetBuiltinMemoryRequired(IGCBuilti
6060
{
6161
case BuiltinAlgorithm::sort:
6262
case BuiltinAlgorithm::clusteredSort:
63+
case BuiltinAlgorithm::clusteredSortedOrdinal:
6364
{
6465
const size_t bits_per_pass = 4;
6566

66-
if ((algorithm == BuiltinAlgorithm::clusteredSort) && (scope != BuiltinMemoryScope::subGroup))
67+
if ((algorithm == BuiltinAlgorithm::clusteredSort || algorithm == BuiltinAlgorithm::clusteredSortedOrdinal) &&
68+
(scope != BuiltinMemoryScope::subGroup))
6769
{
6870
return false;
6971
}

IGC/BiFModule/Implementation/group_sort.cl

Lines changed: 118 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ SPDX-License-Identifier: MIT
1111
constant uint RADIX_SORT_BITS_PER_PASS = 4;
1212
constant uint RADIX_SORT_CHAR_BIT = 8;
1313

14-
/* Default devicelib sub-group sort - bitonic sorting network, value-only */
14+
/* Default devicelib sub-group sort - bitonic sorting network.
15+
Two versions: key-value and value only. */
1516

1617
uint __builtin_sub_group_sort_mirror(uint idx, uint base)
1718
{
@@ -100,6 +101,87 @@ type OVERLOADABLE __builtin_sub_group_sort32(const type aa, const bool is_asc) \
100101
__builtin_sub_group_sort_rotate(slotID, 2), \
101102
__builtin_sub_group_sort_sel(slotID, 2), is_asc); \
102103
return gg; \
104+
} \
105+
\
106+
/* One key-value compare-exchange step of the sub-group sorting network.      */ \
/* Each lane reads the key and value held by lane `shuffleMask`, then keeps   */ \
/* one of the two keys together with the value that travels with it:          */ \
/*   selectMask != 0 : the larger key when is_asc, the smaller one otherwise  */ \
/*   selectMask == 0 : the smaller key when is_asc, the larger one otherwise  */ \
/* *key0 and *val0 are updated in place.                                      */ \
void OVERLOADABLE __builtin_sub_group_sort_compare_exchange_kv( \
    type *key0, uint *val0, const uint shuffleMask, const uint selectMask, \
    const bool is_asc) \
{ \
    type key1 = sub_group_shuffle(*key0, shuffleMask); \
    /* Values are always uint (val0 is `uint *`); keep them as uint rather    */ \
    /* than converting them to the key type and back.                         */ \
    uint val1 = sub_group_shuffle(*val0, shuffleMask); \
    type key_min = min(*key0, key1); \
    type key_max = max(*key0, key1); \
    /* With equal keys both selections yield val1, so this lane takes the     */ \
    /* partner's value (a swap, assuming the partner runs the mirrored step). */ \
    uint val_min = (*key0 < key1) ? *val0 : val1; \
    uint val_max = (*key0 <= key1) ? val1 : *val0; \
    if (selectMask) { \
        *key0 = is_asc ? key_max : key_min; \
        *val0 = is_asc ? val_max : val_min; \
    } else { \
        *key0 = is_asc ? key_min : key_max; \
        *val0 = is_asc ? val_min : val_max; \
    } \
} \
124+
void OVERLOADABLE __builtin_sub_group_sort8_kv( /* key-value variant of the bitonic sub-group sort, used for 8-lane clusters */ \
125+
type *key, uint *val, const bool is_asc) /* key/val are updated in place; is_asc is forwarded to every stage */ \
126+
{ /* six compare-exchange stages; mirror/rotate supply the partner lane, sel the selectMask (helper bodies are outside this view) */ \
127+
const uint slotID = get_sub_group_local_id(); /* this lane's index in the sub-group */ \
128+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* stage 1: mirror, base 2 */ \
129+
__builtin_sub_group_sort_mirror(slotID, 2), \
130+
__builtin_sub_group_sort_sel(slotID, 2), is_asc); \
131+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* stage 2: mirror, base 4 */ \
132+
__builtin_sub_group_sort_mirror(slotID, 4), \
133+
__builtin_sub_group_sort_sel(slotID, 4), is_asc); \
134+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* stage 3: mirror, base 2 */ \
135+
__builtin_sub_group_sort_mirror(slotID, 2), \
136+
__builtin_sub_group_sort_sel(slotID, 2), is_asc); \
137+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* stage 4: mirror, base 8 */ \
138+
__builtin_sub_group_sort_mirror(slotID, 8), \
139+
__builtin_sub_group_sort_sel(slotID, 8), is_asc); \
140+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* stage 5: rotate, base 4 */ \
141+
__builtin_sub_group_sort_rotate(slotID, 4), \
142+
__builtin_sub_group_sort_sel(slotID, 4), is_asc); \
143+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* stage 6: rotate, base 2 */ \
144+
__builtin_sub_group_sort_rotate(slotID, 2), \
145+
__builtin_sub_group_sort_sel(slotID, 2), is_asc); \
146+
} \
147+
void OVERLOADABLE __builtin_sub_group_sort16_kv( /* key-value variant of the bitonic sub-group sort, used for 16-lane clusters */ \
148+
type *key, uint *val, const bool is_asc) /* key/val are updated in place; is_asc is forwarded to every stage */ \
149+
{ /* runs the 8-lane network first, then four more compare-exchange stages with bases 16, 8, 4, 2 */ \
150+
const uint slotID = get_sub_group_local_id(); /* this lane's index in the sub-group */ \
151+
__builtin_sub_group_sort8_kv(key, val, is_asc); /* 8-lane key-value network */ \
152+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* mirror, base 16 */ \
153+
__builtin_sub_group_sort_mirror(slotID, 16), \
154+
__builtin_sub_group_sort_sel(slotID, 16),is_asc); \
155+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 8 */ \
156+
__builtin_sub_group_sort_rotate(slotID, 8), \
157+
__builtin_sub_group_sort_sel(slotID, 8), is_asc); \
158+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 4 */ \
159+
__builtin_sub_group_sort_rotate(slotID, 4), \
160+
__builtin_sub_group_sort_sel(slotID, 4), is_asc); \
161+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 2 */ \
162+
__builtin_sub_group_sort_rotate(slotID, 2), \
163+
__builtin_sub_group_sort_sel(slotID, 2), is_asc); \
164+
} \
165+
void OVERLOADABLE __builtin_sub_group_sort32_kv( /* key-value variant of the bitonic sub-group sort, used for 32-lane clusters */ \
166+
type *key, uint *val, const bool is_asc) /* key/val are updated in place; is_asc is forwarded to every stage */ \
167+
{ /* runs the 16-lane network first, then five more compare-exchange stages with bases 32, 16, 8, 4, 2 */ \
168+
const uint slotID = get_sub_group_local_id(); /* this lane's index in the sub-group */ \
169+
__builtin_sub_group_sort16_kv(key, val, is_asc); /* 16-lane key-value network */ \
170+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* mirror, base 32 */ \
171+
__builtin_sub_group_sort_mirror(slotID, 32), \
172+
__builtin_sub_group_sort_sel(slotID, 32), is_asc); \
173+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 16 */ \
174+
__builtin_sub_group_sort_rotate(slotID, 16), \
175+
__builtin_sub_group_sort_sel(slotID, 16), is_asc); \
176+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 8 */ \
177+
__builtin_sub_group_sort_rotate(slotID, 8), \
178+
__builtin_sub_group_sort_sel(slotID, 8), is_asc); \
179+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 4 */ \
180+
__builtin_sub_group_sort_rotate(slotID, 4), \
181+
__builtin_sub_group_sort_sel(slotID, 4), is_asc); \
182+
__builtin_sub_group_sort_compare_exchange_kv(key, val, /* rotate, base 2 */ \
183+
__builtin_sub_group_sort_rotate(slotID, 2), \
184+
__builtin_sub_group_sort_sel(slotID, 2), is_asc); \
103185
}
104186

105187

@@ -1450,9 +1532,43 @@ type __builtin_IB_sub_group_clustered_sort_##direction##_##type_abbr(
14501532
return sorted; \
14511533
}
14521534

1535+
// clustered sorted ordinal - for each lane, returns the lane index within its cluster that the lane's value would occupy after the cluster is sorted
1536+
// Example: SIMD16 input:
1537+
// 0, 2,19, 4, 1, 5, 7, 9, 19, 7,18, 4,10, 5, 2, 3
1538+
// Result after sorted_ordinal_descend, cluster_size=8:
1539+
// 7, 5, 0, 4, 6, 3, 2, 1, 0, 3, 1, 5, 2, 4, 7, 6
1540+
#define DEFN_CLUSTERED_SUB_GROUP_SORTED_ORDINAL(type, type_abbr, direction, is_asc) \
type __builtin_IB_sub_group_clustered_sorted_ordinal_##direction##_##type_abbr( \
    type value, uint cluster_size) \
{ \
    /* Two key-value sorts per supported cluster size:                        */ \
    /*  1. sort the input by key, carrying each key's originating lane id;    */ \
    /*  2. sort ascending by that originating lane id, carrying the lane the  */ \
    /*     key landed on, so each lane receives its own value's destination.  */ \
    const uint lane = get_sub_group_local_id(); \
    type sorted_key = value; \
    uint origin_lane = lane; \
    uint ordinal = lane; \
    if (cluster_size == 8) { \
        __builtin_sub_group_sort8_kv(&sorted_key, &origin_lane, is_asc); \
        __builtin_sub_group_sort8_kv(&origin_lane, &ordinal, true); \
    } else if (cluster_size == 16) { \
        __builtin_sub_group_sort16_kv(&sorted_key, &origin_lane, is_asc); \
        __builtin_sub_group_sort16_kv(&origin_lane, &ordinal, true); \
    } else if (cluster_size == 32) { \
        __builtin_sub_group_sort32_kv(&sorted_key, &origin_lane, is_asc); \
        __builtin_sub_group_sort32_kv(&origin_lane, &ordinal, true); \
    } \
    /* Any other cluster size skips sorting, leaving ordinal == lane.         */ \
    /* The modulo turns the sub-group lane into a cluster-relative index.     */ \
    return ordinal % cluster_size; \
}
1566+
14531567
#define DEFN_CLUSTERED_SUB_GROUP_SORT(type, type_abbr) \
14541568
DEFN_CLUSTERED_SUB_GROUP_SORT_KEY_ONLY(type, type_abbr, ascend, true) \
1455-
DEFN_CLUSTERED_SUB_GROUP_SORT_KEY_ONLY(type, type_abbr, descend, false)
1569+
DEFN_CLUSTERED_SUB_GROUP_SORT_KEY_ONLY(type, type_abbr, descend, false) \
1570+
DEFN_CLUSTERED_SUB_GROUP_SORTED_ORDINAL(type, type_abbr, ascend, true) \
1571+
DEFN_CLUSTERED_SUB_GROUP_SORTED_ORDINAL(type, type_abbr, descend, false)
14561572

14571573
DEFN_CLUSTERED_SUB_GROUP_SORT(char, i8)
14581574
DEFN_CLUSTERED_SUB_GROUP_SORT(short, i16)

IGC/Compiler/Optimizer/OpenCLPasses/SubGroupReductionPattern/SubGroupReductionPattern.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,9 @@ void SubGroupReductionPattern::matchShufflePattern(GenIntrinsicInst &ShuffleOp,
269269
return matchVectorShufflePattern(ShuffleOp, Lane);
270270
}
271271

272+
if (getWaveOp(Op) == WaveOps::UNDEF)
273+
return;
274+
272275
Value *InputValue = ShuffleOp.getOperand(0);
273276
if (((Op->getOperand(0) == InputValue && Op->getOperand(1) == &ShuffleOp) || (Op->getOperand(0) == &ShuffleOp && Op->getOperand(1) == InputValue)) == false)
274277
return;
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
;
9+
; REQUIRES: llvm-14-plus
10+
;
11+
; RUN: igc_opt -debugify -igc-subgroup-reduction-pattern -check-debugify -S < %s 2>&1 | FileCheck %s
12+
13+
; Test that a shuffle (ShuffleXor or WaveShuffleIndex) whose user is not a reduction op is left unchanged, i.e. is not replaced with WaveAll.
14+
15+
; Debug-info related check
16+
; CHECK: CheckModuleDebugify: PASS
17+
18+
define float @shuffle_xor_invalid_pattern() {
; Negative case: the ShuffleXor result is consumed only by a uitofp conversion,
; never by an op that combines it with the original value %0, so there is
; presumably no reduction for the pass to match and the IR must stay as written.
19+
entry:
20+
; COM: Check no change is applied.
21+
;
22+
; CHECK-LABEL: entry:
23+
; CHECK: %0 = call i32 @get_i32()
24+
; CHECK: %simdShuffleXor = call i32 @llvm.genx.GenISA.simdShuffleXor.i32(i32 %0, i32 8)
25+
; CHECK: %1 = uitofp i32 %simdShuffleXor to float
26+
; CHECK: ret float %1
27+
%0 = call i32 @get_i32()
28+
%simdShuffleXor = call i32 @llvm.genx.GenISA.simdShuffleXor.i32(i32 %0, i32 8)
29+
%1 = uitofp i32 %simdShuffleXor to float
30+
ret float %1
31+
}
32+
33+
define float @shuffle_index_invalid_pattern() {
; Negative case for the index-based form: the partner lane is computed
; explicitly as simdLaneId xor 16 and passed to WaveShuffleIndex, and the
; shuffled value is consumed only by a uitofp conversion, so the IR must stay
; as written.
34+
entry:
35+
; COM: Check no change is applied.
36+
;
37+
; CHECK-LABEL: entry:
38+
; CHECK: %simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
39+
; CHECK: %0 = call i32 @get_i32()
40+
; CHECK: %xor16 = xor i16 %simdLaneId, 16
41+
; CHECK: %zext16 = zext i16 %xor16 to i32
42+
; CHECK: %simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %0, i32 %zext16, i32 0)
43+
; CHECK: %1 = uitofp i32 %simdShuffle to float
44+
; CHECK: ret float %1
45+
%simdLaneId = call i16 @llvm.genx.GenISA.simdLaneId()
46+
%0 = call i32 @get_i32()
47+
%xor16 = xor i16 %simdLaneId, 16
48+
%zext16 = zext i16 %xor16 to i32
49+
%simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %0, i32 %zext16, i32 0)
50+
%1 = uitofp i32 %simdShuffle to float
51+
ret float %1
52+
}
53+
54+
declare i32 @llvm.genx.GenISA.simdShuffleXor.i32(i32, i32)
55+
declare i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32, i32, i32)
56+
declare i16 @llvm.genx.GenISA.simdLaneId()
57+
58+
declare i32 @get_i32()
59+
60+
!igc.functions = !{!0, !1}
61+
62+
!0 = !{float ()* @shuffle_xor_invalid_pattern, !100}
63+
!1 = !{float ()* @shuffle_index_invalid_pattern, !100}
64+
!100 = !{!101, !102}
65+
!101 = !{!"function_type", i32 0}
66+
!102 = !{!"sub_group_size", i32 16}

0 commit comments

Comments
 (0)