diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3e2b2c3510569..03d16fdd54c42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -607,13 +607,15 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( case ISD::FSUB: if (ST->hasPackedFP32Ops() && SLT == MVT::f32) NElts = (NElts + 1) / 2; + if (ST->hasBF16PackedInsts() && SLT == MVT::bf16) + NElts = (NElts + 1) / 2; if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; - if (SLT == MVT::f32 || SLT == MVT::f16) + if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16) return LT.first * NElts * getFullRateInstrCost(); break; case ISD::FDIV: @@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; - if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) || + if ((ST->hasVOP3PInsts() && + (SLT == MVT::f16 || SLT == MVT::i16 || + (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) || (ST->hasPackedFP32Ops() && SLT == MVT::f32)) NElts = (NElts + 1) / 2; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a53beaa2b6f91..afefd01ffb3ba 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -851,6 +851,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, Custom); + if (Subtarget->hasBF16PackedInsts()) { + for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16}) + // Split vector operations. 
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, + VT, Custom); + } + if (Subtarget->hasPackedFP32Ops()) { setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, MVT::v2f32, Legal); @@ -6621,10 +6628,12 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || - VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || + VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll index 7ac4db3119210..904db9064a369 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll @@ -3,11 +3,13 @@ ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt < 
%s -mtriple=amdgcn-unknown-amdhsa -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s define void @canonicalize_f16() { ; BASE-LABEL: 'canonicalize_f16' @@ -141,6 +143,16 @@ define void @canonicalize_bf16() { ; GFX10-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) ; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; BASE-SIZE-LABEL: 'canonicalize_bf16' ; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) ; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) @@ -181,6 +193,15 @@ define void @canonicalize_bf16() { ; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) ; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'canonicalize_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: 
%v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1 %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1 %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1 @@ -203,6 +224,17 @@ define void @canonicalize_f32() { ; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_f32' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; ALL-SIZE-LABEL: 'canonicalize_f32' ; ALL-SIZE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) @@ -214,6 +246,16 @@ define void @canonicalize_f32() { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'canonicalize_f32': +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %f32 = call float @llvm.canonicalize.f32(float undef) #1 %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1 
%v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1 @@ -236,6 +278,16 @@ define void @canonicalize_f64() { ; ALL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_f64' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; ALL-SIZE-LABEL: 'canonicalize_f64' ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) @@ -245,6 +297,16 @@ define void @canonicalize_f64() { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> 
@llvm.canonicalize.v8f64(<8 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX1250-SIZE-LABEL: 'canonicalize_f64' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f64 = call double @llvm.canonicalize.f64(double undef) #1 %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1 @@ -255,9 +317,3 @@ define void @canonicalize_f64() { %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1 ret void } - - - - - - diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 55994d865fa6c..9b1495b35a89d 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ 
b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -2,159 +2,190 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. 
define amdgpu_kernel void @fadd_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fadd_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fadd_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float 
undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v3f32 = fadd <3 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f32' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fadd float undef, undef - %v2f32 = fadd <2 x float> undef, undef - %v3f32 = fadd <3 x float> undef, undef - %v4f32 = fadd <4 x float> undef, undef - %v5f32 = fadd <5 x float> undef, undef - %v8f32 = fadd <8 x float> undef, undef - %v9f32 = fadd <9 x float> undef, undef + %f32 = fadd float poison, poison + %v2f32 = fadd <2 x float> poison, poison + %v3f32 = fadd <3 x float> poison, poison + %v4f32 = fadd <4 x float> poison, poison + %v5f32 = fadd <5 x float> poison, poison + %v8f32 = fadd <8 x float> poison, poison + %v9f32 = fadd <9 x float> poison, poison ret void } define 
amdgpu_kernel void @fadd_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fadd_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fadd_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef +; FASTF64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f64' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f64' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %f64 = fadd double undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f64' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fadd double undef, undef - %v2f64 = fadd <2 x double> undef, undef - %v3f64 = fadd <3 x double> undef, undef - %v4f64 = fadd <4 x double> undef, undef - %v5f64 = fadd <5 x double> undef, undef + %f64 = fadd double poison, poison + %v2f64 = fadd <2 x double> poison, poison + %v3f64 = fadd <3 x double> poison, poison + %v4f64 = fadd <4 x double> poison, poison + %v5f64 = fadd <5 x double> poison, poison ret void } define amdgpu_kernel void @fadd_f16() #0 { ; FASTF16-LABEL: 'fadd_f16' -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f16' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> 
poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF16-SIZE-LABEL: 'fadd_f16' -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> 
poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fadd_f16' -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for 
instruction: %v17f16 = fadd <17 x half> poison, poison ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fadd half undef, undef - %v2f16 = fadd <2 x half> undef, undef - %v3f16 = fadd <3 x half> undef, undef - %v4f16 = fadd <4 x half> undef, undef - %v5f16 = fadd <5 x half> undef, undef - %v16f16 = fadd <16 x half> undef, undef - %v17f16 = fadd <17 x half> undef, undef + %f16 = fadd half poison, poison + %v2f16 = fadd <2 x half> poison, poison + %v3f16 = fadd <3 x half> poison, poison + %v4f16 = fadd <4 x half> poison, poison + %v5f16 = fadd <5 x half> poison, poison + %v16f16 = fadd <16 x half> poison, poison + %v17f16 = fadd <17 x half> poison, poison + ret void +} + +define amdgpu_kernel void @fadd_bf16() #0 { +; GFX1250-LABEL: 'fadd_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; GFX1250-SIZE-LABEL: 'fadd_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> poison, poison +; 
GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = fadd bfloat poison, poison + %v2bf16 = fadd <2 x bfloat> poison, poison + %v3bf16 = fadd <3 x bfloat> poison, poison + %v4bf16 = fadd <4 x bfloat> poison, poison + %v5bf16 = fadd <5 x bfloat> poison, poison + %v16bf16 = fadd <16 x bfloat> poison, poison + %v17bf16 = fadd <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll index 2ff9d4f7f5e38..f34ee31bcf4ce 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -2,166 +2,186 @@ ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 
-mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s define void @fma_f16() { ; FAST-LABEL: 'fma_f16' -; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x 
half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_f16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 
x half> undef, <16 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FAST-SIZE-LABEL: 'fma_f16' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; 
FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; 
SLOW-SIZE-LABEL: 'fma_f16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> 
@llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) - %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) - %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) - %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) - %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) - %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) - %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) + %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) + %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) + %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) + %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) + %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) + %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ret void } define void @fma_bf16() { ; FAST-LABEL: 'fma_bf16' -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, 
<5 x bfloat> poison, <5 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_bf16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x 
bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FAST-SIZE-LABEL: 'fma_bf16' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> 
undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_bf16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost 
of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) - %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) - %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) - %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) - %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) - %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) - %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; GFX1250-LABEL: 'fma_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> 
@llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; GFX1250-SIZE-LABEL: 'fma_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = call bfloat 
@llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) + %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) + %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) + %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ret void } define void @fma_f32() { ; SLOW-LABEL: 'fma_f32' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f32' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) - %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) - %v3f32 = call <3 
x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) - %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) - %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) - %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) - %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) + %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) + %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) + %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) + %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) + %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) + %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ret void } define void @fma_f64() { ; SLOW-LABEL: 'fma_f64' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 
= call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f64' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) - %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) - %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) - %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) - %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) + %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) + %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) + %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) + %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) + %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ret void } diff --git 
a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index adc4eea309a58..c0b9cda23ea04 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -2,210 +2,231 @@ ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,GFX90A-FASTF64 %s ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,F32,FASTF64 %s ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=F32,SLOW %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9-SIZE,GFX90A-SIZE %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. 
define amdgpu_kernel void @fmul_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fmul_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; F32-LABEL: 'fmul_f32' -; F32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; 
F32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef +; F32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison ; F32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f32' -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> 
undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; SIZE-NEXT: Cost Model: 
Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fmul float undef, undef - %v2f32 = fmul <2 x float> undef, undef - %v3f32 = fmul <3 x float> undef, undef - %v4f32 = fmul <4 x float> undef, undef - %v5f32 = fmul <5 x float> undef, undef - %v8f32 = fmul <8 x float> undef, undef - %v9f32 = fmul <9 x float> undef, undef + %f32 = fmul float poison, poison + %v2f32 = fmul <2 x float> poison, poison + %v3f32 = fmul <3 x float> poison, poison + %v4f32 = fmul <4 x float> poison, poison + %v5f32 = fmul <5 x float> poison, poison + %v8f32 = fmul <8 x float> poison, poison + %v9f32 = fmul <9 x float> poison, poison ret void } define amdgpu_kernel void @fmul_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fmul_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fmul_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison +; FASTF64-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f64' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f64' -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = 
fmul <5 x double> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f64' -; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fmul double undef, undef - %v2f64 = fmul <2 x 
double> undef, undef - %v3f64 = fmul <3 x double> undef, undef - %v4f64 = fmul <4 x double> undef, undef - %v5f64 = fmul <5 x double> undef, undef + %f64 = fmul double poison, poison + %v2f64 = fmul <2 x double> poison, poison + %v3f64 = fmul <3 x double> poison, poison + %v4f64 = fmul <4 x double> poison, poison + %v5f64 = fmul <5 x double> poison, poison ret void } define amdgpu_kernel void @fmul_f16() #0 { ; GFX9-LABEL: 'fmul_f16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison ; GFX9-NEXT: 
Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'fmul_f16' -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef -; GFX9-SIZE-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_f16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated 
cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fmul half undef, undef - %v2f16 = fmul <2 x half> undef, undef - %v3f16 = fmul <3 x half> undef, undef - %v4f16 = fmul <4 x half> undef, undef - %v5f16 = fmul <5 x half> undef, undef - %v16f16 = fmul <16 x half> undef, undef - %v17f16 = fmul <17 x half> undef, undef + %f16 = fmul half poison, poison + %v2f16 = fmul <2 x half> poison, poison + %v3f16 = fmul <3 x half> poison, poison + %v4f16 = fmul <4 x half> poison, poison + %v5f16 = fmul <5 x half> poison, poison + %v16f16 = fmul <16 x half> poison, poison + %v17f16 = fmul <17 x half> poison, poison ret void } define amdgpu_kernel void @fmul_bf16() #0 { ; GFX9-LABEL: 'fmul_bf16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_bf16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> 
undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'fmul_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; GFX9-SIZE-LABEL: 'fmul_bf16' -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_bf16' -; 
SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %bf16 = fmul bfloat undef, undef - %v2bf16 = fmul <2 x bfloat> undef, undef - %v3bf16 = fmul <3 x bfloat> undef, undef - %v4bf16 = fmul <4 x bfloat> undef, undef - %v5bf16 = fmul <5 x bfloat> undef, undef - %v16bf16 = fmul <16 x bfloat> undef, undef - 
%v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-SIZE-LABEL: 'fmul_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = fmul bfloat poison, poison + %v2bf16 = fmul <2 x bfloat> poison, poison + %v3bf16 = fmul <3 x bfloat> poison, poison + %v4bf16 = fmul <4 x bfloat> poison, poison + %v5bf16 = fmul <5 x bfloat> poison, poison + %v16bf16 = fmul <16 x bfloat> poison, poison + %v17bf16 = fmul <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 4e71a71326bad..6b71603f70f6b 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -2,158 +2,191 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s ; RUN: opt -passes="print" 
2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END.
define amdgpu_kernel void @fsub_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fsub_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fsub_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float 
undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %v3f32 = fsub <3 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f32' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fsub float undef, undef - %v2f32 = fsub <2 x float> undef, undef - %v3f32 = fsub <3 x float> undef, undef - %v4f32 = fsub <4 x float> undef, undef - %v5f32 = fsub <5 x float> undef, undef - %v8f32 = fsub <8 x float> undef, undef - %v9f32 = fsub <9 x float> undef, undef + %f32 = fsub float poison, poison + %v2f32 = fsub <2 x float> poison, poison + %v3f32 = fsub <3 x float> poison, poison + %v4f32 = fsub <4 x float> poison, poison + %v5f32 = fsub <5 x float> poison, poison + %v8f32 = fsub <8 x float> poison, poison + %v9f32 = fsub <9 x float> poison, poison ret void } define 
amdgpu_kernel void @fsub_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fsub_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fsub_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef +; FASTF64-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f64' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f64' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %f64 = fsub double undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f64' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fsub double undef, undef - %v2f64 = fsub <2 x double> undef, undef - %v3f64 = fsub <3 x double> undef, undef - %v4f64 = fsub <4 x double> undef, undef - %v5f64 = fsub <5 x double> undef, undef + %f64 = fsub double poison, poison + %v2f64 = fsub <2 x double> poison, poison + %v3f64 = fsub <3 x double> poison, poison + %v4f64 = fsub <4 x double> poison, poison + %v5f64 = fsub <5 x double> poison, poison ret void } define amdgpu_kernel void @fsub_f16() #0 { ; FASTF16-LABEL: 'fsub_f16' -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f16' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> 
poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF16-SIZE-LABEL: 'fsub_f16' -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> 
poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fsub_f16' -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for 
instruction: %v17f16 = fsub <17 x half> poison, poison ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fsub half undef, undef - %v2f16 = fsub <2 x half> undef, undef - %v3f16 = fsub <3 x half> undef, undef - %v4f16 = fsub <4 x half> undef, undef - %v5f16 = fsub <5 x half> undef, undef - %v16f16 = fsub <16 x half> undef, undef - %v17f16 = fsub <17 x half> undef, undef + %f16 = fsub half poison, poison + %v2f16 = fsub <2 x half> poison, poison + %v3f16 = fsub <3 x half> poison, poison + %v4f16 = fsub <4 x half> poison, poison + %v5f16 = fsub <5 x half> poison, poison + %v16f16 = fsub <16 x half> poison, poison + %v17f16 = fsub <17 x half> poison, poison + ret void +} + +define amdgpu_kernel void @fsub_bf16() #0 { +; GFX1250-LABEL: 'fsub_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX1250-SIZE-LABEL: 'fsub_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison +; 
GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = fsub bfloat poison, poison + %v2bf16 = fsub <2 x bfloat> poison, poison + %v3bf16 = fsub <3 x bfloat> poison, poison + %v4bf16 = fsub <4 x bfloat> poison, poison + %v5bf16 = fsub <5 x bfloat> poison, poison + %v16bf16 = fsub <16 x bfloat> poison, poison + %v17bf16 = fsub <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 371e460d9638e..0490e5a19b4b7 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -10908,13 +10908,12 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; ; GFX1250-LABEL: v_fadd_v2bf16: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -11447,14 +11446,13 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; 
GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; ; GFX1250-LABEL: v_fadd_v4bf16: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 -; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -49040,6 +49038,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat) declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) +declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>) +declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>) +declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>) define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fma_bf16: @@ -49990,6 +49991,98 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ret <4 x bfloat> %op } +; GFX1250-LABEL: v_fma_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { + %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + ret <8 x bfloat> %op +} 
+ +; GFX1250-LABEL: v_fma_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19 +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20 +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) { + %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) + ret <16 x bfloat> %op +} + +; GFX1250-LABEL: v_fma_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x10 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v55, off, s32 +; GFX1250-NEXT: s_wait_loadcnt 0xf +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32 +; 
GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33 +; GFX1250-NEXT: s_wait_loadcnt 0xd +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35 +; GFX1250-NEXT: s_wait_loadcnt 0xb +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36 +; GFX1250-NEXT: s_wait_loadcnt 0xa +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38 +; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39 +; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48 +; GFX1250-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50 +; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52 +; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) { + %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) + ret <32 x bfloat> %op +} + declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat) declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll new file mode 100644 index 0000000000000..a4cdb0387df9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll @@ -0,0 
+1,1292 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s + +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare bfloat @llvm.canonicalize.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 +declare <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat>) #0 +declare <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat>) #0 +declare <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat>) #0 +declare <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat>) #0 +declare <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat>) #0 +declare <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat>) #0 +declare <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat>) #0 +declare <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat>) #0 +declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: +; GFX1250: %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; 
GFX1250-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) + store bfloat %canonicalized, ptr addrspace(1) poison + ret void +} + +; GFX1250-LABEL: s_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e64 v0, s2, s2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { + %val = bitcast i16 %val.arg to bfloat + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 { + %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0 + %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1) + ret <2 x bfloat> %canonicalized +} + + +; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + + +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %val.fabs.fneg = fneg bfloat %val.fabs + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + 
ret void +} + +; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 { + %val = load bfloat, ptr addrspace(1) %out + %val.fneg = fneg bfloat %val + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 { + %val = load bfloat, ptr addrspace(1) %out + %val.fneg = fneg bfloat %val + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} 
+ +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 { + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %val.fabs.fneg = fneg bfloat %val.fabs + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +; GFX1250-LABEL: test_fold_canonicalize_p0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +; GFX1250-NEXT: .Lfunc_end10: +define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 { + 
%canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +; GFX1250-NEXT: .Lfunc_end12: +define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; 
GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C00) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -1 to bfloat)) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -2 to bfloat)) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void 
@test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C01) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7DFF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFDFF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFC01) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: 
s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val) + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val) + %val.fabs.fneg = fneg <2 x bfloat> %val.fabs + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs.fneg) + store <2 x bfloat> 
%canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %fneg.val = fneg <2 x bfloat> %val + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %fneg.val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: s_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: v_max_num_f32_e64 v0, s3, s3 +; GFX1250-NEXT: v_max_num_f32_e64 v1, s2, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v1, v0 +; GFX1250-NEXT: global_store_b32 
v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { + %val = bitcast i32 %val.arg to <2 x bfloat> + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> zeroinitializer) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -0.0, bfloat -0.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 1.0, bfloat 1.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: 
test_fold_canonicalize_n1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -1.0, bfloat -1.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 16.0, bfloat 16.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel 
void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C00, bfloat 0xR7C00>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: 
test_fold_canonicalize_qnan_value_neg1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> bitcast (i32 -1 to <2 x bfloat>)) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat bitcast (i16 -2 to bfloat), bfloat bitcast (i16 -2 to bfloat)>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C01, bfloat 0xR7C01>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7DFF, bfloat 0xR7DFF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFDFF, bfloat 0xRFDFF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFC01, bfloat 0xRFC01>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_var_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: 
v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <3 x bfloat> @v_test_canonicalize_var_v3bf16(<3 x bfloat> %val) #1 { + %canonicalized = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> %val) + ret <3 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 { + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %val) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_reg_undef_v2bf16(bfloat %val) #1 { + %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_undef_reg_v2bf16(bfloat %val) #1 { + %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_undef_lo_imm_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; 
GFX1250-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x3f80 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_imm_lo_undef_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_undef_lo_k_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x41800000 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_undef_lo_k_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_k_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x4180 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_k_lo_undef_hi_v2bf16() #1 { + %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 
0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { + %vec0 = insertelement <2 x bfloat> poison, bfloat %val, i32 0 + %vec1 = insertelement <2 x bfloat> %vec0, bfloat 2.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { + %vec0 = insertelement <2 x bfloat> poison, bfloat 2.0, i32 0 + %vec1 = insertelement <2 x bfloat> %vec0, bfloat %val, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) + ret <2 x bfloat> %canonicalized +} +; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm +define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 { + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) + store <4 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} +; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_reg_undef_undef_undef_v4bf16(bfloat %val) #1 { + %vec = insertelement <4 x bfloat> poison, bfloat %val, i32 0 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0, bfloat %val1) #1 { + %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 + %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 1 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec1) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <4 x bfloat> @v_test_canonicalize_reg_undef_reg_reg_v4bf16(bfloat %val0, bfloat %val1, bfloat %val2) #1 { + %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 + %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 2 + %vec2 = insertelement <4 x bfloat> %vec1, bfloat %val2, i32 3 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec2) + ret <4 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 
v2, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <6 x bfloat> @v_test_canonicalize_var_v6bf16(<6 x bfloat> %val) #1 { + %canonicalized = call <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat> %val) + ret <6 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v3, v3, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <8 x bfloat> @v_test_canonicalize_var_v8bf16(<8 x bfloat> %val) #1 { + %canonicalized = call <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat> %val) + ret <8 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v12bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 
v6, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v8, v8, v8 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <12 x bfloat> @v_test_canonicalize_var_v12bf16(<12 x bfloat> %val) #1 { + %canonicalized = call <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat> %val) + ret <12 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 
+; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v5, 16, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v12, v12, v12 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v15, v15, v15 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v15 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v13 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v10 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <16 x bfloat> @v_test_canonicalize_var_v16bf16(<16 x bfloat> 
%val) #1 { + %canonicalized = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> %val) + ret <16 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX1250-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX1250-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX1250-NEXT: v_dual_lshlrev_b32 v14, 16, v14 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX1250-NEXT: v_max_num_f32_e32 v18, v18, v18 +; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v12 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX1250-NEXT: v_max_num_f32_e32 v20, v20, v20 +; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX1250-NEXT: v_dual_lshlrev_b32 v10, 16, v10 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX1250-NEXT: v_max_num_f32_e32 v22, v22, v22 +; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v8 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX1250-NEXT: v_max_num_f32_e32 v24, v24, v24 +; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: 
v_dual_max_num_f32 v15, v15, v15 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v19, v19, v19 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v23, v23, v23 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v7, v7, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v28, v28, v28 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v30, v30, v30 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v31, v31, v31 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v31 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v30 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v29 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v28 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v27 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v26 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v25 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v24 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v23 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v22 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v21 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v20 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v19 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v18 +; 
GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v17 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <32 x bfloat> @v_test_canonicalize_var_v32bf16(<32 x bfloat> %val) #1 { + %canonicalized = call <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat> %val) + ret <32 x bfloat> %canonicalized +} +; GFX1250-LABEL: v_test_canonicalize_var_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v38, 0xffff0000, v24 +; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 +; GFX1250-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX1250-NEXT: v_and_b32_e32 v80, 0xffff0000, v6 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v81, v81, v81 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX1250-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX1250-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX1250-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX1250-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX1250-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v82, v82, v82 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v24, v24, v24 +; GFX1250-NEXT: v_max_num_f32_e32 v39, v39, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v23, v23, v23 :: v_dual_max_num_f32 v48, v48, v48 +; GFX1250-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +;
GFX1250-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX1250-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX1250-NEXT: v_and_b32_e32 v37, 0xffff0000, v25 +; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX1250-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 +; GFX1250-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX1250-NEXT: v_and_b32_e32 v50, 0xffff0000, v20 +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 +; GFX1250-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX1250-NEXT: v_and_b32_e32 v52, 0xffff0000, v18 +; GFX1250-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX1250-NEXT: v_and_b32_e32 v53, 0xffff0000, v17 +; GFX1250-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX1250-NEXT: v_and_b32_e32 v54, 0xffff0000, v16 +; GFX1250-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX1250-NEXT: v_and_b32_e32 v55, 0xffff0000, v15 +; GFX1250-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v64, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v65, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v66, 0xffff0000, v12 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX1250-NEXT: v_and_b32_e32 v67, 0xffff0000, v11 +; GFX1250-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v68, 0xffff0000, v10 +; GFX1250-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1250-NEXT: v_and_b32_e32 v69, 0xffff0000, v9 +; GFX1250-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v70, 0xffff0000, v8 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX1250-NEXT: v_and_b32_e32 v71, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v81 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v82 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v4 +; GFX1250-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v83 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v32, v32, v32 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v36, v36, v36 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v37, v37, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v23, v23, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v30, v30, v30 :: v_dual_max_num_f32 v33, v33, v33 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v34, v34, v34 +; GFX1250-NEXT: v_dual_max_num_f32 v28, v28, v28 :: v_dual_max_num_f32 v35, v35, v35 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v38, v38, v38 +; GFX1250-NEXT: v_dual_max_num_f32 v22, v22, v22 :: v_dual_max_num_f32 v49, v49, v49 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v50, v50, v50 +; GFX1250-NEXT: v_dual_max_num_f32 v20, v20, v20 :: v_dual_max_num_f32 v51, v51, v51 +; GFX1250-NEXT: v_dual_max_num_f32 v19, v19, v19 :: v_dual_max_num_f32 v52, v52, v52 +; GFX1250-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_max_num_f32 v53, v53, v53 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v54, v54, v54 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_max_num_f32 v55, v55, v55 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_max_num_f32 v64, v64, v64 +; GFX1250-NEXT: v_dual_max_num_f32 v14, v14, v14 :: v_dual_max_num_f32 v65, v65, v65 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v66, v66, v66 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v67, v67, v67 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v68, v68, v68 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v69, v69, v69 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v70, v70, v70 +; GFX1250-NEXT: 
v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v71, v71, v71 +; GFX1250-NEXT: v_dual_max_num_f32 v80, v80, v80 :: v_dual_max_num_f32 v81, v81, v81 +; GFX1250-NEXT: v_dual_max_num_f32 v82, v82, v82 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_max_num_f32_e32 v7, v7, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v26, v26, v36 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v83 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v82 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v81 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v80 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v71 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v70 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v69 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v68 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v67 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v66 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v65 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v64 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v55 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v16, v16, v54 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v17, v17, v53 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v18, v18, v52 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v19, v19, v51 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v20, v20, v50 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v21, v21, v49 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v22, v22, v48 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v24, v24, v38 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v25, v25, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v27, v27, v35 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v28, v28, v34 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v29, v29, v33 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v30, v30, v32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v31 +; GFX1250-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_num_f32_e32 v36, v39, v39 +; GFX1250-NEXT: v_max_num_f32_e32 v31, v31, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v31, v31, v36 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +define <64 x bfloat> @v_test_canonicalize_var_v64bf16(<64 x bfloat> %val) #1 { + %canonicalized = call <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat> %val) + ret <64 x bfloat> %canonicalized +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }