Skip to content

Commit e94a0b3

Browse files
authored
[AMDGPU] Fix vector legalization for bf16 valu ops (#158439)
Add v4, v8, v16, and v32 bf16 vector legalizations for the following operations: `FADD`, `FMUL`, `FMA`, and `FCANONICALIZE`.
1 parent dfc8854 commit e94a0b3

File tree

9 files changed

+2023
-464
lines changed

9 files changed

+2023
-464
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -607,13 +607,15 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
607607
case ISD::FSUB:
608608
if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
609609
NElts = (NElts + 1) / 2;
610+
if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
611+
NElts = (NElts + 1) / 2;
610612
if (SLT == MVT::f64)
611613
return LT.first * NElts * get64BitInstrCost(CostKind);
612614

613615
if (ST->has16BitInsts() && SLT == MVT::f16)
614616
NElts = (NElts + 1) / 2;
615617

616-
if (SLT == MVT::f32 || SLT == MVT::f16)
618+
if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
617619
return LT.first * NElts * getFullRateInstrCost();
618620
break;
619621
case ISD::FDIV:
@@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
746748

747749
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
748750

749-
if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
751+
if ((ST->hasVOP3PInsts() &&
752+
(SLT == MVT::f16 || SLT == MVT::i16 ||
753+
(SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
750754
(ST->hasPackedFP32Ops() && SLT == MVT::f32))
751755
NElts = (NElts + 1) / 2;
752756

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
851851
setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852852
Custom);
853853

854+
if (Subtarget->hasBF16PackedInsts()) {
855+
for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
856+
// Split vector operations.
857+
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
858+
VT, Custom);
859+
}
860+
854861
if (Subtarget->hasPackedFP32Ops()) {
855862
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
856863
MVT::v2f32, Legal);
@@ -6621,10 +6628,12 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
66216628
SelectionDAG &DAG) const {
66226629
unsigned Opc = Op.getOpcode();
66236630
EVT VT = Op.getValueType();
6624-
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6625-
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6626-
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6627-
VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6631+
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6632+
VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6633+
VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6634+
VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6635+
VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6636+
VT == MVT::v32bf16);
66286637

66296638
auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
66306639

llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
44
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
55
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
6+
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s
67

78
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
89
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
910
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
1011
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
12+
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s
1113

1214
define void @canonicalize_f16() {
1315
; BASE-LABEL: 'canonicalize_f16'
@@ -141,6 +143,16 @@ define void @canonicalize_bf16() {
141143
; GFX10-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
142144
; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
143145
;
146+
; GFX1250-LABEL: 'canonicalize_bf16'
147+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
148+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
149+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
150+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
151+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
152+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
153+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
154+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
155+
;
144156
; BASE-SIZE-LABEL: 'canonicalize_bf16'
145157
; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
146158
; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
@@ -181,6 +193,15 @@ define void @canonicalize_bf16() {
181193
; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
182194
; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
183195
;
196+
; GFX1250-SIZE-LABEL: 'canonicalize_bf16'
197+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
198+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
199+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
200+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
201+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
202+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
203+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
204+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
184205
%bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1
185206
%v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1
186207
%v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1
@@ -203,6 +224,17 @@ define void @canonicalize_f32() {
203224
; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
204225
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
205226
;
227+
; GFX1250-LABEL: 'canonicalize_f32'
228+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
229+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
230+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
231+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
232+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
233+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
234+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
235+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
236+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
237+
;
206238
; ALL-SIZE-LABEL: 'canonicalize_f32'
207239
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
208240
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
@@ -214,6 +246,16 @@ define void @canonicalize_f32() {
214246
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
215247
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
216248
;
249+
; GFX1250-SIZE-LABEL: 'canonicalize_f32':
250+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
251+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
252+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
253+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
254+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
255+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
256+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
257+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
258+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
217259
%f32 = call float @llvm.canonicalize.f32(float undef) #1
218260
%v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1
219261
%v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1
@@ -236,6 +278,16 @@ define void @canonicalize_f64() {
236278
; ALL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
237279
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
238280
;
281+
; GFX1250-LABEL: 'canonicalize_f64'
282+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
283+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
284+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
285+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
286+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
287+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
288+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
289+
; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
290+
;
239291
; ALL-SIZE-LABEL: 'canonicalize_f64'
240292
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
241293
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
@@ -245,6 +297,16 @@ define void @canonicalize_f64() {
245297
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
246298
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
247299
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
300+
;
301+
; GFX1250-SIZE-LABEL: 'canonicalize_f64'
302+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
303+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
304+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
305+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
306+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
307+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
308+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
309+
; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
248310
;
249311
%f64 = call double @llvm.canonicalize.f64(double undef) #1
250312
%v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1
@@ -255,9 +317,3 @@ define void @canonicalize_f64() {
255317
%v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1
256318
ret void
257319
}
258-
259-
260-
261-
262-
263-

0 commit comments

Comments
 (0)