From 8ca54e68c9e4b594663f0f3c404f2573a56a37e1 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Sat, 13 Sep 2025 00:49:08 +0100 Subject: [PATCH 1/4] [AMDGPU] Fix vector legalization for bf16 valu ops Add v4,v8,v16,v32 legalizations for the following operations: - FADD - FMUL - FMA - FCANONICALIZE --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 6 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 ++ .../Analysis/CostModel/AMDGPU/canonicalize.ll | 68 +++++++++++++++++-- llvm/test/CodeGen/AMDGPU/bf16.ll | 30 +++++--- 4 files changed, 93 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3e2b2c3510569..b07e936c494f6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -607,6 +607,8 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( case ISD::FSUB: if (ST->hasPackedFP32Ops() && SLT == MVT::f32) NElts = (NElts + 1) / 2; + if (ST->hasBF16PackedInsts() && SLT == MVT::bf16) + NElts = (NElts + 1) / 2; if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); @@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; - if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) || + if ((ST->hasVOP3PInsts() && + (SLT == MVT::f16 || SLT == MVT::i16 || + (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) || (ST->hasPackedFP32Ops() && SLT == MVT::f32)) NElts = (NElts + 1) / 2; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a53beaa2b6f91..6f761bc1dc11e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -851,6 +851,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, Custom); + if (Subtarget->hasBF16PackedInsts()) { + for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16}) + // Split vector operations. + setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, + VT, Custom); + } + if (Subtarget->hasPackedFP32Ops()) { setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, MVT::v2f32, Legal); diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll index 7ac4db3119210..904db9064a369 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll @@ -3,11 +3,13 @@ ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s define void @canonicalize_f16() { ; BASE-LABEL: 'canonicalize_f16' @@ -141,6 +143,16 @@ define void @canonicalize_bf16() { ; GFX10-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) ; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; BASE-SIZE-LABEL: 'canonicalize_bf16' ; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) ; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) @@ -181,6 +193,15 @@ define void @canonicalize_bf16() { ; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) ; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'canonicalize_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1 %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1 %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1 @@ -203,6 +224,17 @@ define void @canonicalize_f32() { ; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_f32' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; ALL-SIZE-LABEL: 'canonicalize_f32' ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) @@ -214,6 +246,16 @@ define void @canonicalize_f32() { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'canonicalize_f32': +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %f32 = call float @llvm.canonicalize.f32(float undef) #1 %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1 %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1 @@ -236,6 +278,16 @@ define void @canonicalize_f64() { ; ALL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_f64' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; ALL-SIZE-LABEL: 'canonicalize_f64' ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) @@ -245,6 +297,16 @@ define void @canonicalize_f64() { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX1250-SIZE-LABEL: 'canonicalize_f64' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f64 = call double @llvm.canonicalize.f64(double undef) #1 %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1 @@ -255,9 +317,3 @@ define void @canonicalize_f64() { %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1 ret void } - - - - - - diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 371e460d9638e..b5d7ddf5aee89 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -7,10 +7,14 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 +<<<<<<< HEAD ; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16 ; FIXME: real-true16 version of gfx1250 test fails +======= +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250 +>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops) define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_load_store: @@ -10908,13 +10912,12 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; ; GFX1250-LABEL: v_fadd_v2bf16: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -11447,14 +11450,13 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; ; GFX1250-LABEL: v_fadd_v4bf16: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 -; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -49361,7 +49363,10 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +<<<<<<< HEAD ; +======= +>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops) ; GFX1250-LABEL: v_fma_v2bf16: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -49978,7 +49983,10 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +<<<<<<< HEAD ; +======= +>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops) ; GFX1250-LABEL: v_fma_v4bf16: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 From 3f99c0c409dc42244321f62833a97c573263d948 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Fri, 19 Sep 2025 09:38:09 +0100 Subject: [PATCH 2/4] Add testing coverage - part I --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +- llvm/test/Analysis/CostModel/AMDGPU/fadd.ll | 31 + llvm/test/Analysis/CostModel/AMDGPU/fma.ll | 20 + llvm/test/Analysis/CostModel/AMDGPU/fmul.ll | 21 + llvm/test/Analysis/CostModel/AMDGPU/fsub.ll | 33 + llvm/test/CodeGen/AMDGPU/bf16.ll | 10 + .../test/CodeGen/AMDGPU/fcanonicalize.bf16.ll | 1292 +++++++++++++++++ 7 files changed, 1414 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6f761bc1dc11e..77f0c55100981 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6628,10 +6628,12 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || - VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || + VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -6686,6 +6688,7 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); + VT.dump(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 55994d865fa6c..d9729479f7410 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -2,9 +2,11 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. define amdgpu_kernel void @fadd_f32() #0 { @@ -158,4 +160,33 @@ define amdgpu_kernel void @fadd_f16() #0 { ret void } +define amdgpu_kernel void @fadd_bf16() #0 { +; GFX1250-LABEL: 'fadd_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fadd bfloat undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; GFX1250-SIZE-LABEL: 'fadd_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = fadd bfloat undef, undef + %v2bf16 = fadd <2 x bfloat> undef, undef + %v3bf16 = fadd <3 x bfloat> undef, undef + %v4bf16 = fadd <4 x bfloat> undef, undef + %v5bf16 = fadd <5 x bfloat> undef, undef + %v16bf16 = fadd <16 x bfloat> undef, undef + %v17bf16 = fadd <17 x bfloat> undef, undef + ret void +} + attributes #0 = { nounwind } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll index 2ff9d4f7f5e38..db2170af2c801 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -2,10 +2,12 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s define void @fma_f16() { @@ -100,6 +102,24 @@ define void @fma_bf16() { ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-LABEL: 'fma_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; GFX1250-SIZE-LABEL: 'fma_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index adc4eea309a58..5315852d3225c 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -2,9 +2,11 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,F32,FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=F32,SLOW %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9-SIZE,GFX90A-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. define amdgpu_kernel void @fmul_f32() #0 { @@ -179,6 +181,16 @@ define amdgpu_kernel void @fmul_bf16() #0 { ; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'fmul_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; GFX9-SIZE-LABEL: 'fmul_bf16' ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef @@ -199,6 +211,15 @@ define amdgpu_kernel void @fmul_bf16() #0 { ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'fmul_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = fmul bfloat undef, undef %v2bf16 = fmul <2 x bfloat> undef, undef %v3bf16 = fmul <3 x bfloat> undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 4e71a71326bad..61929a64244d6 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -2,9 +2,11 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. define amdgpu_kernel void @fsub_f32() #0 { @@ -157,3 +159,34 @@ define amdgpu_kernel void @fsub_f16() #0 { %v17f16 = fsub <17 x half> undef, undef ret void } + +define amdgpu_kernel void @fsub_bf16() #0 { +; GFX1250-LABEL: 'fsub_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fsub bfloat undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX1250-SIZE-LABEL: 'fsub_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = fsub bfloat undef, undef + %v2bf16 = fsub <2 x bfloat> undef, undef + %v3bf16 = fsub <3 x bfloat> undef, undef + %v4bf16 = fsub <4 x bfloat> undef, undef + %v5bf16 = fsub <5 x bfloat> undef, undef + %v16bf16 = fsub <16 x bfloat> undef, undef + %v17bf16 = fsub <17 x bfloat> undef, undef + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index b5d7ddf5aee89..e10141f9ba809 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -49641,6 +49641,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +<<<<<<< HEAD ; ; GFX1250-LABEL: v_fma_v3bf16: ; GFX1250: ; %bb.0: @@ -49649,6 +49650,15 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +======= +; GFX1250-LABEL: v_fma_v3bf16: +; GFX1250: %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +>>>>>>> cc3762e87c75 (Add testing coverage - part I) %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll new file mode 100644 index 0000000000000..a4cdb0387df9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll @@ -0,0 +1,1292 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s + +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare bfloat @llvm.canonicalize.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 +declare <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat>) #0 +declare <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat>) #0 +declare <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat>) #0 +declare <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat>) #0 +declare <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat>) #0 +declare <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat>) #0 +declare <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat>) #0 +declare <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat>) #0 +declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: +; GFX1250: %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) + store bfloat %canonicalized, ptr addrspace(1) poison + ret void +} + +; GFX1250-LABEL: s_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e64 v0, s2, s2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { + %val = bitcast i16 %val.arg to bfloat + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 { + %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0 + %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1) + ret <2 x bfloat> %canonicalized +} + + +; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + + +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %val.fabs.fneg = fneg bfloat %val.fabs + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %val.fneg = fneg bfloat %val + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 { + %val = load bfloat, ptr addrspace(1) %out + %val.fneg = fneg bfloat %val + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 { + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %val.fabs.fneg = fneg bfloat %val.fabs + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: test_fold_canonicalize_p0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +; GFX1250-NEXT: .Lfunc_end10: +define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +; GFX1250-NEXT: .Lfunc_end12: +define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C00) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -1 to bfloat)) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -2 to bfloat)) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C01) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7DFF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFDFF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFC01) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val) + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val) + %val.fabs.fneg = fneg <2 x bfloat> %val.fabs + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs.fneg) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %fneg.val = fneg <2 x bfloat> %val + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %fneg.val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: s_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: v_max_num_f32_e64 v0, s3, s3 +; GFX1250-NEXT: v_max_num_f32_e64 v1, s2, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v1, v0 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { + %val = bitcast i32 %val.arg to <2 x bfloat> + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> zeroinitializer) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> bitcast (i32 -1 to <2 x bfloat>)) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_var_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <3 x bfloat> @v_test_canonicalize_var_v3bf16(<3 x bfloat> %val) #1 { + %canonicalized = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> %val) + ret <3 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 { + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %val) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_reg_undef_v2bf16(bfloat %val) #1 { + %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_undef_reg_v2bf16(bfloat %val) #1 { + %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_undef_lo_imm_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x3f80 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_imm_lo_undef_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_undef_lo_k_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x41800000 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_undef_lo_k_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_k_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x4180 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_k_lo_undef_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { + %vec0 = insertelement <2 x bfloat> poison, bfloat %val, i32 0 + %vec1 = insertelement <2 x bfloat> %vec0, bfloat 2.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { + %vec0 = insertelement <2 x bfloat> poison, bfloat 2.0, i32 0 + %vec1 = insertelement <2 x bfloat> %vec0, bfloat %val, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) + store <4 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_reg_undef_undef_undef_v4bf16(bfloat %val) #1 { + %vec = insertelement <4 x bfloat> poison, bfloat %val, i32 0 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0, bfloat %val1) #1 { + %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 + %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 1 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec1) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_reg_undef_reg_reg_v4bf16(bfloat %val0, bfloat %val1, bfloat %val2) #1 { + %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 + %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 2 + %vec2 = insertelement <4 x bfloat> %vec1, bfloat %val2, i32 3 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec2) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <6 x bfloat> @v_test_canonicalize_var_v6bf16(<6 x bfloat> %val) #1 { + %canonicalized = call <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat> %val) + ret <6 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v3, v3, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <8 x bfloat> @v_test_canonicalize_var_v8bf16(<8 x bfloat> %val) #1 { + %canonicalized = call <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat> %val) + ret <8 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v12bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v8, v8, v8 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <12 x bfloat> @v_test_canonicalize_var_v12bf16(<12 x bfloat> %val) #1 { + %canonicalized = call <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat> %val) + ret <12 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v5, 16, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v12, v12, v12 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v15, v15, v15 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v15 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v13 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v10 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <16 x bfloat> @v_test_canonicalize_var_v16bf16(<16 x bfloat> %val) #1 { + %canonicalized = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> %val) + ret <16 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX1250-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX1250-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX1250-NEXT: v_dual_lshlrev_b32 v14, 16, v14 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX1250-NEXT: v_max_num_f32_e32 v18, v18, v18 +; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v12 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX1250-NEXT: v_max_num_f32_e32 v20, v20, v20 +; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX1250-NEXT: v_dual_lshlrev_b32 v10, 16, v10 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX1250-NEXT: v_max_num_f32_e32 v22, v22, v22 +; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v8 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX1250-NEXT: v_max_num_f32_e32 v24, v24, v24 +; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v19, v19, v19 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v23, v23, v23 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v7, v7, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v28, v28, v28 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v30, v30, v30 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v31, v31, v31 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v31 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v30 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v29 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v28 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v27 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v26 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v25 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v24 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v23 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v22 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v21 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v20 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v19 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v18 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v17 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <32 x bfloat> @v_test_canonicalize_var_v32bf16(<32 x bfloat> %val) #1 { + %canonicalized = call <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat> %val) + ret <32 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v64bf16: +; GFX1250: %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v38, 0xffff0000, v24 +; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 +; GFX1250-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX1250-NEXT: v_and_b32_e32 v80, 0xffff0000, v6 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v81, v81, v81 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX1250-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX1250-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX1250-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX1250-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX1250-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v82, v82, v82 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v24, v24, v24 +; GFX1250-NEXT: v_max_num_f32_e32 v39, v39, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v23, v23, v23 :: v_dual_max_num_f32 v48, v48, v48 +; GFX1250-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX1250-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX1250-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX1250-NEXT: v_and_b32_e32 v37, 0xffff0000, v25 +; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX1250-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 +; GFX1250-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX1250-NEXT: v_and_b32_e32 v50, 0xffff0000, v20 +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 +; GFX1250-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX1250-NEXT: v_and_b32_e32 v52, 0xffff0000, v18 +; GFX1250-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX1250-NEXT: v_and_b32_e32 v53, 0xffff0000, v17 +; GFX1250-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX1250-NEXT: v_and_b32_e32 v54, 0xffff0000, v16 +; GFX1250-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX1250-NEXT: v_and_b32_e32 v55, 0xffff0000, v15 +; GFX1250-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v64, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v65, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v66, 0xffff0000, v12 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX1250-NEXT: v_and_b32_e32 v67, 0xffff0000, v11 +; GFX1250-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v68, 0xffff0000, v10 +; GFX1250-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1250-NEXT: v_and_b32_e32 v69, 0xffff0000, v9 +; GFX1250-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v70, 0xffff0000, v8 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX1250-NEXT: v_and_b32_e32 v71, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v81 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v82 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v4 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v83 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v32, v32, v32 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v36, v36, v36 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v37, v37, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v23, v23, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v30, v30, v30 :: v_dual_max_num_f32 v33, v33, v33 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v34, v34, v34 +; GFX1250-NEXT: v_dual_max_num_f32 v28, v28, v28 :: v_dual_max_num_f32 v35, v35, v35 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v38, v38, v38 +; GFX1250-NEXT: v_dual_max_num_f32 v22, v22, v22 :: v_dual_max_num_f32 v49, v49, v49 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v50, v50, v50 +; GFX1250-NEXT: v_dual_max_num_f32 v20, v20, v20 :: v_dual_max_num_f32 v51, v51, v51 +; GFX1250-NEXT: v_dual_max_num_f32 v19, v19, v19 :: v_dual_max_num_f32 v52, v52, v52 +; GFX1250-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_max_num_f32 v53, v53, v53 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v54, v54, v54 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_max_num_f32 v55, v55, v55 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_max_num_f32 v64, v64, v64 +; GFX1250-NEXT: v_dual_max_num_f32 v14, v14, v14 :: v_dual_max_num_f32 v65, v65, v65 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v66, v66, v66 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v67, v67, v67 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v68, v68, v68 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v69, v69, v69 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v70, v70, v70 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v71, v71, v71 +; GFX1250-NEXT: v_dual_max_num_f32 v80, v80, v80 :: v_dual_max_num_f32 v81, v81, v81 +; GFX1250-NEXT: v_dual_max_num_f32 v82, v82, v82 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_max_num_f32_e32 v7, v7, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v26, v26, v36 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v83 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v82 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v81 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v80 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v71 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v70 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v69 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v68 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v67 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v66 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v65 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v64 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v55 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v16, v16, v54 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v17, v17, v53 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v18, v18, v52 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v19, v19, v51 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v20, v20, v50 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v21, v21, v49 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v22, v22, v48 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v24, v24, v38 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v25, v25, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v27, v27, v35 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v28, v28, v34 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v29, v29, v33 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v30, v30, v32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v31 +; GFX1250-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_num_f32_e32 v36, v39, v39 +; GFX1250-NEXT: v_max_num_f32_e32 v31, v31, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v31, v31, v36 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <64 x bfloat> @v_test_canonicalize_var_v64bf16(<64 x bfloat> %val) #1 { + %canonicalized = call <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat> %val) + ret <64 x bfloat> %canonicalized +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } From 32d932f3f761c995f03d9e8f1db90bfcea99375a Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Wed, 24 Sep 2025 00:15:40 +0100 Subject: [PATCH 3/4] Fix CostModel for bf16 --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +- llvm/test/Analysis/CostModel/AMDGPU/fadd.ll | 22 ++++----- llvm/test/Analysis/CostModel/AMDGPU/fmul.ll | 48 +++++++++---------- llvm/test/Analysis/CostModel/AMDGPU/fsub.ll | 26 +++++----- 4 files changed, 49 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index b07e936c494f6..03d16fdd54c42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -615,7 +615,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; - if (SLT == MVT::f32 || SLT == MVT::f16) + if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16) return LT.first * NElts * getFullRateInstrCost(); break; case ISD::FDIV: diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index d9729479f7410..3538c07938530 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -162,22 +162,22 @@ define amdgpu_kernel void @fadd_f16() #0 { define amdgpu_kernel void @fadd_bf16() #0 { ; GFX1250-LABEL: 'fadd_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fadd bfloat undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef ; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; GFX1250-SIZE-LABEL: 'fadd_bf16' ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat undef, undef ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = fadd bfloat undef, undef %v2bf16 = fadd <2 x bfloat> undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index 5315852d3225c..c022046fbf1df 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -162,13 +162,13 @@ define amdgpu_kernel void @fmul_f16() #0 { define amdgpu_kernel void @fmul_bf16() #0 { ; GFX9-LABEL: 'fmul_bf16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_bf16' @@ -182,23 +182,23 @@ define amdgpu_kernel void @fmul_bf16() #0 { ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX1250-LABEL: 'fmul_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef ; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'fmul_bf16' ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_bf16' @@ -214,11 +214,11 @@ define amdgpu_kernel void @fmul_bf16() #0 { ; GFX1250-SIZE-LABEL: 'fmul_bf16' ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = fmul bfloat undef, undef %v2bf16 = fmul <2 x bfloat> undef, undef diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 61929a64244d6..37f947d9c6341 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -6,7 +6,7 @@ ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s -; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s +; RUN opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. define amdgpu_kernel void @fsub_f32() #0 { @@ -162,23 +162,23 @@ define amdgpu_kernel void @fsub_f16() #0 { define amdgpu_kernel void @fsub_bf16() #0 { ; GFX1250-LABEL: 'fsub_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fsub bfloat undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX1250-SIZE-LABEL: 'fsub_bf16' ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %bf16 = fsub bfloat undef, undef From 0b3633928018ffc4a0813150d5b61de197ca0b5c Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 25 Sep 2025 00:31:01 +0100 Subject: [PATCH 4/4] Latest fixes --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 - llvm/test/Analysis/CostModel/AMDGPU/fadd.ll | 242 ++++++++-------- llvm/test/Analysis/CostModel/AMDGPU/fma.ll | 240 ++++++++-------- llvm/test/Analysis/CostModel/AMDGPU/fmul.ll | 298 ++++++++++---------- llvm/test/Analysis/CostModel/AMDGPU/fsub.ll | 242 ++++++++-------- llvm/test/CodeGen/AMDGPU/bf16.ll | 115 ++++++-- 6 files changed, 606 insertions(+), 532 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 77f0c55100981..afefd01ffb3ba 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6688,7 +6688,6 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - VT.dump(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 3538c07938530..9b1495b35a89d 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -11,181 +11,181 @@ define amdgpu_kernel void @fadd_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fadd_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fadd_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f32' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fadd float undef, undef - %v2f32 = fadd <2 x float> undef, undef - %v3f32 = fadd <3 x float> undef, undef - %v4f32 = fadd <4 x float> undef, undef - %v5f32 = fadd <5 x float> undef, undef - %v8f32 = fadd <8 x float> undef, undef - %v9f32 = fadd <9 x float> undef, undef + %f32 = fadd float poison, poison + %v2f32 = fadd <2 x float> poison, poison + %v3f32 = fadd <3 x float> poison, poison + %v4f32 = fadd <4 x float> poison, poison + %v5f32 = fadd <5 x float> poison, poison + %v8f32 = fadd <8 x float> poison, poison + %v9f32 = fadd <9 x float> poison, poison ret void } define amdgpu_kernel void @fadd_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fadd_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fadd_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f64' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f64' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f64' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fadd double undef, undef - %v2f64 = fadd <2 x double> undef, undef - %v3f64 = fadd <3 x double> undef, undef - %v4f64 = fadd <4 x double> undef, undef - %v5f64 = fadd <5 x double> undef, undef + %f64 = fadd double poison, poison + %v2f64 = fadd <2 x double> poison, poison + %v3f64 = fadd <3 x double> poison, poison + %v4f64 = fadd <4 x double> poison, poison + %v5f64 = fadd <5 x double> poison, poison ret void } define amdgpu_kernel void @fadd_f16() #0 { ; FASTF16-LABEL: 'fadd_f16' -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f16' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF16-SIZE-LABEL: 'fadd_f16' -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fadd_f16' -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fadd half undef, undef - %v2f16 = fadd <2 x half> undef, undef - %v3f16 = fadd <3 x half> undef, undef - %v4f16 = fadd <4 x half> undef, undef - %v5f16 = fadd <5 x half> undef, undef - %v16f16 = fadd <16 x half> undef, undef - %v17f16 = fadd <17 x half> undef, undef + %f16 = fadd half poison, poison + %v2f16 = fadd <2 x half> poison, poison + %v3f16 = fadd <3 x half> poison, poison + %v4f16 = fadd <4 x half> poison, poison + %v5f16 = fadd <5 x half> poison, poison + %v16f16 = fadd <16 x half> poison, poison + %v17f16 = fadd <17 x half> poison, poison ret void } define amdgpu_kernel void @fadd_bf16() #0 { ; GFX1250-LABEL: 'fadd_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> poison, poison ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; GFX1250-SIZE-LABEL: 'fadd_bf16' -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> poison, poison ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void - %bf16 = fadd bfloat undef, undef - %v2bf16 = fadd <2 x bfloat> undef, undef - %v3bf16 = fadd <3 x bfloat> undef, undef - %v4bf16 = fadd <4 x bfloat> undef, undef - %v5bf16 = fadd <5 x bfloat> undef, undef - %v16bf16 = fadd <16 x bfloat> undef, undef - %v17bf16 = fadd <17 x bfloat> undef, undef + %bf16 = fadd bfloat poison, poison + %v2bf16 = fadd <2 x bfloat> poison, poison + %v3bf16 = fadd <3 x bfloat> poison, poison + %v4bf16 = fadd <4 x bfloat> poison, poison + %v5bf16 = fadd <5 x bfloat> poison, poison + %v16bf16 = fadd <16 x bfloat> poison, poison + %v17bf16 = fadd <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll index db2170af2c801..f34ee31bcf4ce 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -12,176 +12,176 @@ define void @fma_f16() { ; FAST-LABEL: 'fma_f16' -; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_f16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FAST-SIZE-LABEL: 'fma_f16' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) - %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) - %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) - %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) - %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) - %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) - %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) + %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) + %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) + %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) + %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) + %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) + %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ret void } define void @fma_bf16() { ; FAST-LABEL: 'fma_bf16' -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_bf16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FAST-SIZE-LABEL: 'fma_bf16' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_bf16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; GFX1250-LABEL: 'fma_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; GFX1250-SIZE-LABEL: 'fma_bf16' -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void - %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) - %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) - %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) - %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) - %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) - %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) - %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) + %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) + %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) + %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) + %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ret void } define void @fma_f32() { ; SLOW-LABEL: 'fma_f32' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f32' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) - %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) - %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) - %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) - %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) - %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) - %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) + %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) + %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) + %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) + %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) + %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) + %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ret void } define void @fma_f64() { ; SLOW-LABEL: 'fma_f64' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f64' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) - %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) - %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) - %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) - %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) + %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) + %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) + %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) + %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) + %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index c022046fbf1df..c0b9cda23ea04 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -11,222 +11,222 @@ define amdgpu_kernel void @fmul_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fmul_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; F32-LABEL: 'fmul_f32' -; F32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef +; F32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison ; F32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f32' -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fmul float undef, undef - %v2f32 = fmul <2 x float> undef, undef - %v3f32 = fmul <3 x float> undef, undef - %v4f32 = fmul <4 x float> undef, undef - %v5f32 = fmul <5 x float> undef, undef - %v8f32 = fmul <8 x float> undef, undef - %v9f32 = fmul <9 x float> undef, undef + %f32 = fmul float poison, poison + %v2f32 = fmul <2 x float> poison, poison + %v3f32 = fmul <3 x float> poison, poison + %v4f32 = fmul <4 x float> poison, poison + %v5f32 = fmul <5 x float> poison, poison + %v8f32 = fmul <8 x float> poison, poison + %v9f32 = fmul <9 x float> poison, poison ret void } define amdgpu_kernel void @fmul_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fmul_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fmul_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f64' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f64' -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f64' -; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fmul double undef, undef - %v2f64 = fmul <2 x double> undef, undef - %v3f64 = fmul <3 x double> undef, undef - %v4f64 = fmul <4 x double> undef, undef - %v5f64 = fmul <5 x double> undef, undef + %f64 = fmul double poison, poison + %v2f64 = fmul <2 x double> poison, poison + %v3f64 = fmul <3 x double> poison, poison + %v4f64 = fmul <4 x double> poison, poison + %v5f64 = fmul <5 x double> poison, poison ret void } define amdgpu_kernel void @fmul_f16() #0 { ; GFX9-LABEL: 'fmul_f16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'fmul_f16' -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_f16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fmul half undef, undef - %v2f16 = fmul <2 x half> undef, undef - %v3f16 = fmul <3 x half> undef, undef - %v4f16 = fmul <4 x half> undef, undef - %v5f16 = fmul <5 x half> undef, undef - %v16f16 = fmul <16 x half> undef, undef - %v17f16 = fmul <17 x half> undef, undef + %f16 = fmul half poison, poison + %v2f16 = fmul <2 x half> poison, poison + %v3f16 = fmul <3 x half> poison, poison + %v4f16 = fmul <4 x half> poison, poison + %v5f16 = fmul <5 x half> poison, poison + %v16f16 = fmul <16 x half> poison, poison + %v17f16 = fmul <17 x half> poison, poison ret void } define amdgpu_kernel void @fmul_bf16() #0 { ; GFX9-LABEL: 'fmul_bf16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_bf16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX1250-LABEL: 'fmul_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'fmul_bf16' -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_bf16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; GFX1250-SIZE-LABEL: 'fmul_bf16' -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void - %bf16 = fmul bfloat undef, undef - %v2bf16 = fmul <2 x bfloat> undef, undef - %v3bf16 = fmul <3 x bfloat> undef, undef - %v4bf16 = fmul <4 x bfloat> undef, undef - %v5bf16 = fmul <5 x bfloat> undef, undef - %v16bf16 = fmul <16 x bfloat> undef, undef - %v17bf16 = fmul <17 x bfloat> undef, undef + %bf16 = fmul bfloat poison, poison + %v2bf16 = fmul <2 x bfloat> poison, poison + %v3bf16 = fmul <3 x bfloat> poison, poison + %v4bf16 = fmul <4 x bfloat> poison, poison + %v5bf16 = fmul <5 x bfloat> poison, poison + %v16bf16 = fmul <16 x bfloat> poison, poison + %v17bf16 = fmul <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 37f947d9c6341..6b71603f70f6b 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -11,182 +11,182 @@ define amdgpu_kernel void @fsub_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fsub_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fsub_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f32' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fsub float undef, undef - %v2f32 = fsub <2 x float> undef, undef - %v3f32 = fsub <3 x float> undef, undef - %v4f32 = fsub <4 x float> undef, undef - %v5f32 = fsub <5 x float> undef, undef - %v8f32 = fsub <8 x float> undef, undef - %v9f32 = fsub <9 x float> undef, undef + %f32 = fsub float poison, poison + %v2f32 = fsub <2 x float> poison, poison + %v3f32 = fsub <3 x float> poison, poison + %v4f32 = fsub <4 x float> poison, poison + %v5f32 = fsub <5 x float> poison, poison + %v8f32 = fsub <8 x float> poison, poison + %v9f32 = fsub <9 x float> poison, poison ret void } define amdgpu_kernel void @fsub_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fsub_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fsub_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f64' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f64' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f64' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fsub double undef, undef - %v2f64 = fsub <2 x double> undef, undef - %v3f64 = fsub <3 x double> undef, undef - %v4f64 = fsub <4 x double> undef, undef - %v5f64 = fsub <5 x double> undef, undef + %f64 = fsub double poison, poison + %v2f64 = fsub <2 x double> poison, poison + %v3f64 = fsub <3 x double> poison, poison + %v4f64 = fsub <4 x double> poison, poison + %v5f64 = fsub <5 x double> poison, poison ret void } define amdgpu_kernel void @fsub_f16() #0 { ; FASTF16-LABEL: 'fsub_f16' -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f16' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF16-SIZE-LABEL: 'fsub_f16' -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fsub_f16' -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fsub half undef, undef - %v2f16 = fsub <2 x half> undef, undef - %v3f16 = fsub <3 x half> undef, undef - %v4f16 = fsub <4 x half> undef, undef - %v5f16 = fsub <5 x half> undef, undef - %v16f16 = fsub <16 x half> undef, undef - %v17f16 = fsub <17 x half> undef, undef + %f16 = fsub half poison, poison + %v2f16 = fsub <2 x half> poison, poison + %v3f16 = fsub <3 x half> poison, poison + %v4f16 = fsub <4 x half> poison, poison + %v5f16 = fsub <5 x half> poison, poison + %v16f16 = fsub <16 x half> poison, poison + %v17f16 = fsub <17 x half> poison, poison ret void } define amdgpu_kernel void @fsub_bf16() #0 { ; GFX1250-LABEL: 'fsub_bf16' -; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef -; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX1250-SIZE-LABEL: 'fsub_bf16' -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef -; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %bf16 = fsub bfloat undef, undef - %v2bf16 = fsub <2 x bfloat> undef, undef - %v3bf16 = fsub <3 x bfloat> undef, undef - %v4bf16 = fsub <4 x bfloat> undef, undef - %v5bf16 = fsub <5 x bfloat> undef, undef - %v16bf16 = fsub <16 x bfloat> undef, undef - %v17bf16 = fsub <17 x bfloat> undef, undef + %bf16 = fsub bfloat poison, poison + %v2bf16 = fsub <2 x bfloat> poison, poison + %v3bf16 = fsub <3 x bfloat> poison, poison + %v4bf16 = fsub <4 x bfloat> poison, poison + %v5bf16 = fsub <5 x bfloat> poison, poison + %v16bf16 = fsub <16 x bfloat> poison, poison + %v17bf16 = fsub <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index e10141f9ba809..0490e5a19b4b7 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -7,14 +7,10 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 -<<<<<<< HEAD ; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16 ; FIXME: real-true16 version of gfx1250 test fails -======= -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250 ->>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops) define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_load_store: @@ -49042,6 +49038,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat) declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) +declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>) +declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>) +declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>) define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fma_bf16: @@ -49363,10 +49362,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -<<<<<<< HEAD ; -======= ->>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops) ; GFX1250-LABEL: v_fma_v2bf16: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -49641,7 +49637,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -<<<<<<< HEAD ; ; GFX1250-LABEL: v_fma_v3bf16: ; GFX1250: ; %bb.0: @@ -49650,15 +49645,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 ; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] -======= -; GFX1250-LABEL: v_fma_v3bf16: -; GFX1250: %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 -; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] ->>>>>>> cc3762e87c75 (Add testing coverage - part I) %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -49993,10 +49979,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -<<<<<<< HEAD ; -======= ->>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops) ; GFX1250-LABEL: v_fma_v4bf16: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -50008,6 +49991,98 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ret <4 x bfloat> %op } +; GFX1250-LABEL: v_fma_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { + %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + ret <8 x bfloat> %op +} + +; GFX1250-LABEL: v_fma_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19 +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20 +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) { + %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) + ret <16 x bfloat> %op +} + +; GFX1250-LABEL: v_fma_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x10 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v55, off, s32 +; GFX1250-NEXT: s_wait_loadcnt 0xf +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32 +; GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33 +; GFX1250-NEXT: s_wait_loadcnt 0xd +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35 +; GFX1250-NEXT: s_wait_loadcnt 0xb +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36 +; GFX1250-NEXT: s_wait_loadcnt 0xa +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38 +; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39 +; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48 +; GFX1250-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50 +; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52 +; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) { + %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) + ret <32 x bfloat> %op +} + declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat) declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)