Skip to content

Commit c876d53

Browse files
authored
DAG: Avoid creating illegal extract_subvector in legalizer (#154100)
Fixes #153808
1 parent 876fdc9 commit c876d53

File tree

2 files changed

+74
-4
lines changed

2 files changed

+74
-4
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3842,13 +3842,32 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
38423842
uint64_t LoEltsMin = Lo.getValueType().getVectorMinNumElements();
38433843
uint64_t IdxVal = Idx->getAsZExtVal();
38443844

3845+
unsigned NumResultElts = SubVT.getVectorMinNumElements();
3846+
38453847
if (IdxVal < LoEltsMin) {
3846-
assert(IdxVal + SubVT.getVectorMinNumElements() <= LoEltsMin &&
3848+
assert(IdxVal + NumResultElts <= LoEltsMin &&
38473849
"Extracted subvector crosses vector split!");
38483850
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
3849-
} else if (SubVT.isScalableVector() ==
3850-
N->getOperand(0).getValueType().isScalableVector())
3851-
return DAG.getExtractSubvector(dl, SubVT, Hi, IdxVal - LoEltsMin);
3851+
}
3852+
3853+
EVT SrcVT = N->getOperand(0).getValueType();
3854+
if (SubVT.isScalableVector() == SrcVT.isScalableVector()) {
3855+
uint64_t ExtractIdx = IdxVal - LoEltsMin;
3856+
if (ExtractIdx % NumResultElts == 0)
3857+
return DAG.getExtractSubvector(dl, SubVT, Hi, ExtractIdx);
3858+
3859+
// We cannot create an extract_subvector whose index isn't a multiple of the
3860+
// result size, as it may go out of bounds for the last elements. Shuffle the
3861+
// desired elements down to index 0 and do a simple extract at index 0.
3862+
EVT HiVT = Hi.getValueType();
3863+
SmallVector<int, 8> Mask(HiVT.getVectorNumElements(), -1);
3864+
for (int I = 0; I != static_cast<int>(NumResultElts); ++I)
3865+
Mask[I] = ExtractIdx + I;
3866+
3867+
SDValue Shuffle =
3868+
DAG.getVectorShuffle(HiVT, dl, Hi, DAG.getPOISON(HiVT), Mask);
3869+
return DAG.getExtractSubvector(dl, SubVT, Shuffle, 0);
3870+
}
38523871

38533872
// After this point the DAG node only permits extracting fixed-width
38543873
// subvectors from scalable vectors.
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
4+
5+
define <3 x float> @issue153808_vector_extract_assert(ptr addrspace(1) %ptr) #0 {
6+
; GFX900-LABEL: issue153808_vector_extract_assert:
7+
; GFX900: ; %bb.0:
8+
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9+
; GFX900-NEXT: v_mov_b32_e32 v4, v1
10+
; GFX900-NEXT: v_mov_b32_e32 v3, v0
11+
; GFX900-NEXT: global_load_dwordx4 v[5:8], v[3:4], off
12+
; GFX900-NEXT: global_load_dwordx3 v[0:2], v[3:4], off offset:192
13+
; GFX900-NEXT: s_mov_b32 s4, 0
14+
; GFX900-NEXT: s_mov_b32 s5, s4
15+
; GFX900-NEXT: s_mov_b32 s6, s4
16+
; GFX900-NEXT: s_mov_b32 s7, s4
17+
; GFX900-NEXT: s_waitcnt vmcnt(1)
18+
; GFX900-NEXT: buffer_store_dwordx4 v[5:8], off, s[4:7], 0
19+
; GFX900-NEXT: s_waitcnt vmcnt(0)
20+
; GFX900-NEXT: s_setpc_b64 s[30:31]
21+
;
22+
; GFX942-LABEL: issue153808_vector_extract_assert:
23+
; GFX942: ; %bb.0:
24+
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25+
; GFX942-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
26+
; GFX942-NEXT: global_load_dwordx3 v[2:4], v[0:1], off offset:192
27+
; GFX942-NEXT: s_mov_b32 s0, 0
28+
; GFX942-NEXT: s_mov_b32 s1, s0
29+
; GFX942-NEXT: s_mov_b32 s2, s0
30+
; GFX942-NEXT: s_mov_b32 s3, s0
31+
; GFX942-NEXT: s_waitcnt vmcnt(1)
32+
; GFX942-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
33+
; GFX942-NEXT: s_waitcnt vmcnt(1)
34+
; GFX942-NEXT: v_mov_b32_e32 v0, v2
35+
; GFX942-NEXT: v_mov_b32_e32 v1, v3
36+
; GFX942-NEXT: v_mov_b32_e32 v2, v4
37+
; GFX942-NEXT: s_waitcnt vmcnt(0)
38+
; GFX942-NEXT: s_setpc_b64 s[30:31]
39+
%val = load <51 x float>, ptr addrspace(1) %ptr, align 4
40+
%val.slice.0 = shufflevector <51 x float> %val, <51 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
41+
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %val.slice.0, ptr addrspace(8) null, i32 0, i32 0, i32 0)
42+
%val.slice.48 = shufflevector <51 x float> %val, <51 x float> poison, <3 x i32> <i32 48, i32 49, i32 50>
43+
ret <3 x float> %val.slice.48
44+
}
45+
46+
declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #1
47+
48+
attributes #0 = { nounwind }
49+
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
50+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
51+
; GFX9: {{.*}}

0 commit comments

Comments
 (0)