Skip to content

Commit 3a0fa12

Browse files
authored
DAG: Handle half spanning extract_subvector in type legalization (#154101)
Previously, type legalization would just assert if an extract_subvector needed elements from both halves of a split vector. As the simplest implementation, extract the individual elements from both halves and build a new vector from them. This could try to do better and create a partial extract or a shuffle (or maybe that is best left for the combiner to figure out later). Fixes a secondary issue noticed as part of #153808.
1 parent d611a9c commit 3a0fa12

File tree

2 files changed

+157
-3
lines changed

2 files changed

+157
-3
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3845,9 +3845,22 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
38453845
unsigned NumResultElts = SubVT.getVectorMinNumElements();
38463846

38473847
if (IdxVal < LoEltsMin) {
3848-
assert(IdxVal + NumResultElts <= LoEltsMin &&
3849-
"Extracted subvector crosses vector split!");
3850-
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
3848+
// If the extracted elements are all in the low half, do a simple extract.
3849+
if (IdxVal + NumResultElts <= LoEltsMin)
3850+
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
3851+
3852+
// Extracted subvector crosses vector split, so we need to blend the two
3853+
// halves.
3854+
// TODO: May be able to emit partial extract_subvector.
3855+
SmallVector<SDValue, 8> Elts;
3856+
Elts.reserve(NumResultElts);
3857+
3858+
DAG.ExtractVectorElements(Lo, Elts, /*Start=*/IdxVal,
3859+
/*Count=*/LoEltsMin - IdxVal);
3860+
DAG.ExtractVectorElements(Hi, Elts, /*Start=*/0,
3861+
/*Count=*/SubVT.getVectorNumElements() -
3862+
Elts.size());
3863+
return DAG.getBuildVector(SubVT, dl, Elts);
38513864
}
38523865

38533866
EVT SrcVT = N->getOperand(0).getValueType();

llvm/test/CodeGen/AMDGPU/issue153808-extract-subvector-legalize.ll

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,147 @@
22
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
44

5+
define <3 x float> @extract_subvector_v3f32_v33f32_elt30_0(ptr addrspace(1) %ptr) #0 {
6+
; GFX900-LABEL: extract_subvector_v3f32_v33f32_elt30_0:
7+
; GFX900: ; %bb.0:
8+
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:96 glc
10+
; GFX900-NEXT: s_waitcnt vmcnt(0)
11+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:80 glc
12+
; GFX900-NEXT: s_waitcnt vmcnt(0)
13+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:64 glc
14+
; GFX900-NEXT: s_waitcnt vmcnt(0)
15+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:48 glc
16+
; GFX900-NEXT: s_waitcnt vmcnt(0)
17+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:32 glc
18+
; GFX900-NEXT: s_waitcnt vmcnt(0)
19+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
20+
; GFX900-NEXT: s_waitcnt vmcnt(0)
21+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
22+
; GFX900-NEXT: s_waitcnt vmcnt(0)
23+
; GFX900-NEXT: global_load_dword v2, v[0:1], off offset:128 glc
24+
; GFX900-NEXT: s_waitcnt vmcnt(0)
25+
; GFX900-NEXT: global_load_dwordx4 v[3:6], v[0:1], off offset:112 glc
26+
; GFX900-NEXT: s_waitcnt vmcnt(0)
27+
; GFX900-NEXT: v_mov_b32_e32 v0, v5
28+
; GFX900-NEXT: v_mov_b32_e32 v1, v6
29+
; GFX900-NEXT: s_setpc_b64 s[30:31]
30+
;
31+
; GFX942-LABEL: extract_subvector_v3f32_v33f32_elt30_0:
32+
; GFX942: ; %bb.0:
33+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:96 sc0 sc1
35+
; GFX942-NEXT: s_waitcnt vmcnt(0)
36+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:80 sc0 sc1
37+
; GFX942-NEXT: s_waitcnt vmcnt(0)
38+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:64 sc0 sc1
39+
; GFX942-NEXT: s_waitcnt vmcnt(0)
40+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:48 sc0 sc1
41+
; GFX942-NEXT: s_waitcnt vmcnt(0)
42+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:32 sc0 sc1
43+
; GFX942-NEXT: s_waitcnt vmcnt(0)
44+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 sc0 sc1
45+
; GFX942-NEXT: s_waitcnt vmcnt(0)
46+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off sc0 sc1
47+
; GFX942-NEXT: s_waitcnt vmcnt(0)
48+
; GFX942-NEXT: global_load_dword v2, v[0:1], off offset:128 sc0 sc1
49+
; GFX942-NEXT: s_waitcnt vmcnt(0)
50+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:112 sc0 sc1
51+
; GFX942-NEXT: s_waitcnt vmcnt(0)
52+
; GFX942-NEXT: v_mov_b32_e32 v0, v6
53+
; GFX942-NEXT: v_mov_b32_e32 v1, v7
54+
; GFX942-NEXT: s_setpc_b64 s[30:31]
55+
%val = load volatile <33 x float>, ptr addrspace(1) %ptr, align 4
56+
%extract.subvector = shufflevector <33 x float> %val, <33 x float> poison, <3 x i32> <i32 30, i32 31, i32 32>
57+
ret <3 x float> %extract.subvector
58+
}
59+
60+
define <3 x float> @extract_subvector_v3f32_v33f32_elt30_1(ptr addrspace(1) %ptr) #0 {
61+
; GFX900-LABEL: extract_subvector_v3f32_v33f32_elt30_1:
62+
; GFX900: ; %bb.0:
63+
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64+
; GFX900-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
65+
; GFX900-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:112
66+
; GFX900-NEXT: global_load_dword v2, v[0:1], off offset:128
67+
; GFX900-NEXT: s_mov_b32 s4, 0
68+
; GFX900-NEXT: s_mov_b32 s5, s4
69+
; GFX900-NEXT: s_mov_b32 s6, s4
70+
; GFX900-NEXT: s_mov_b32 s7, s4
71+
; GFX900-NEXT: s_waitcnt vmcnt(2)
72+
; GFX900-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
73+
; GFX900-NEXT: s_waitcnt vmcnt(2)
74+
; GFX900-NEXT: v_mov_b32_e32 v0, v9
75+
; GFX900-NEXT: v_mov_b32_e32 v1, v10
76+
; GFX900-NEXT: s_waitcnt vmcnt(0)
77+
; GFX900-NEXT: s_setpc_b64 s[30:31]
78+
;
79+
; GFX942-LABEL: extract_subvector_v3f32_v33f32_elt30_1:
80+
; GFX942: ; %bb.0:
81+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
83+
; GFX942-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:112
84+
; GFX942-NEXT: global_load_dword v2, v[0:1], off offset:128
85+
; GFX942-NEXT: s_mov_b32 s0, 0
86+
; GFX942-NEXT: s_mov_b32 s1, s0
87+
; GFX942-NEXT: s_mov_b32 s2, s0
88+
; GFX942-NEXT: s_mov_b32 s3, s0
89+
; GFX942-NEXT: s_waitcnt vmcnt(2)
90+
; GFX942-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
91+
; GFX942-NEXT: s_waitcnt vmcnt(2)
92+
; GFX942-NEXT: v_mov_b32_e32 v0, v10
93+
; GFX942-NEXT: v_mov_b32_e32 v1, v11
94+
; GFX942-NEXT: s_waitcnt vmcnt(0)
95+
; GFX942-NEXT: s_setpc_b64 s[30:31]
96+
%val = load <33 x float>, ptr addrspace(1) %ptr, align 4
97+
%val.slice.0 = shufflevector <33 x float> %val, <33 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
98+
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
99+
%val.slice.48 = shufflevector <33 x float> %val, <33 x float> poison, <3 x i32> <i32 30, i32 31, i32 32>
100+
ret <3 x float> %val.slice.48
101+
}
102+
103+
define <6 x float> @extract_subvector_v6f32_v36f32_elt30(ptr addrspace(1) %ptr) #0 {
104+
; GFX900-LABEL: extract_subvector_v6f32_v36f32_elt30:
105+
; GFX900: ; %bb.0:
106+
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107+
; GFX900-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
108+
; GFX900-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:112
109+
; GFX900-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:128
110+
; GFX900-NEXT: s_mov_b32 s4, 0
111+
; GFX900-NEXT: s_mov_b32 s5, s4
112+
; GFX900-NEXT: s_mov_b32 s6, s4
113+
; GFX900-NEXT: s_mov_b32 s7, s4
114+
; GFX900-NEXT: s_waitcnt vmcnt(2)
115+
; GFX900-NEXT: buffer_store_dwordx4 v[6:9], off, s[4:7], 0
116+
; GFX900-NEXT: s_waitcnt vmcnt(2)
117+
; GFX900-NEXT: v_mov_b32_e32 v0, v12
118+
; GFX900-NEXT: v_mov_b32_e32 v1, v13
119+
; GFX900-NEXT: s_waitcnt vmcnt(0)
120+
; GFX900-NEXT: s_setpc_b64 s[30:31]
121+
;
122+
; GFX942-LABEL: extract_subvector_v6f32_v36f32_elt30:
123+
; GFX942: ; %bb.0:
124+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125+
; GFX942-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
126+
; GFX942-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:112
127+
; GFX942-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:128
128+
; GFX942-NEXT: s_mov_b32 s0, 0
129+
; GFX942-NEXT: s_mov_b32 s1, s0
130+
; GFX942-NEXT: s_mov_b32 s2, s0
131+
; GFX942-NEXT: s_mov_b32 s3, s0
132+
; GFX942-NEXT: s_waitcnt vmcnt(2)
133+
; GFX942-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
134+
; GFX942-NEXT: s_waitcnt vmcnt(2)
135+
; GFX942-NEXT: v_mov_b32_e32 v0, v12
136+
; GFX942-NEXT: v_mov_b32_e32 v1, v13
137+
; GFX942-NEXT: s_waitcnt vmcnt(0)
138+
; GFX942-NEXT: s_setpc_b64 s[30:31]
139+
%val = load <36 x float>, ptr addrspace(1) %ptr, align 4
140+
%val.slice.0 = shufflevector <36 x float> %val, <36 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
141+
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
142+
%val.slice.1 = shufflevector <36 x float> %val, <36 x float> poison, <6 x i32> <i32 30, i32 31, i32 32, i32 33, i32 34, i32 35>
143+
ret <6 x float> %val.slice.1
144+
}
145+
5146
define <3 x float> @issue153808_vector_extract_assert(ptr addrspace(1) %ptr) #0 {
6147
; GFX900-LABEL: issue153808_vector_extract_assert:
7148
; GFX900: ; %bb.0:

0 commit comments

Comments
 (0)