From 8ca54e68c9e4b594663f0f3c404f2573a56a37e1 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini <giuseppe.rossini@amd.com>
Date: Sat, 13 Sep 2025 00:49:08 +0100
Subject: [PATCH 1/4] [AMDGPU] Fix vector legalization for bf16 valu ops

Add custom (split-vector) legalization of v4bf16, v8bf16, v16bf16 and
v32bf16 on subtargets with packed bf16 instructions, for the following
operations:
- FADD
- FMUL
- FMA
- FCANONICALIZE
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  6 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  7 ++
 .../Analysis/CostModel/AMDGPU/canonicalize.ll | 68 +++++++++++++++++--
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 30 +++++---
 4 files changed, 93 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3e2b2c3510569..b07e936c494f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -607,6 +607,8 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
   case ISD::FSUB:
     if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
       NElts = (NElts + 1) / 2;
+    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
+      NElts = (NElts + 1) / 2;
     if (SLT == MVT::f64)
       return LT.first * NElts * get64BitInstrCost(CostKind);
 
@@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
-  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
+  if ((ST->hasVOP3PInsts() &&
+       (SLT == MVT::f16 || SLT == MVT::i16 ||
+        (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a53beaa2b6f91..6f761bc1dc11e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -851,6 +851,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                        Custom);
 
+    if (Subtarget->hasBF16PackedInsts()) {
+      for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
+        // Split vector operations.
+        setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
+                           VT, Custom);
+    }
+
     if (Subtarget->hasPackedFP32Ops()) {
       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                          MVT::v2f32, Legal);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
index 7ac4db3119210..904db9064a369 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -3,11 +3,13 @@
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s
 
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s
 
 define void @canonicalize_f16() {
 ; BASE-LABEL: 'canonicalize_f16'
@@ -141,6 +143,16 @@ define void @canonicalize_bf16() {
 ; GFX10-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
 ; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'canonicalize_bf16'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+;
 ; BASE-SIZE-LABEL: 'canonicalize_bf16'
 ; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
 ; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
@@ -181,6 +193,15 @@ define void @canonicalize_bf16() {
 ; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
 ; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+; GFX1250-SIZE-LABEL: 'canonicalize_bf16'
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
   %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1
   %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1
   %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1
@@ -203,6 +224,17 @@ define void @canonicalize_f32() {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'canonicalize_f32'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %f32 = call float @llvm.canonicalize.f32(float undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction:   %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction:   %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+;
 ; ALL-SIZE-LABEL: 'canonicalize_f32'
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
@@ -214,6 +246,16 @@ define void @canonicalize_f32() {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+; GFX1250-SIZE-LABEL: 'canonicalize_f32':
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %f32 = call float @llvm.canonicalize.f32(float undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction:   %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction:   %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
   %f32 = call float @llvm.canonicalize.f32(float undef) #1
   %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1
   %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1
@@ -236,6 +278,16 @@ define void @canonicalize_f64() {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'canonicalize_f64'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %f64 = call double @llvm.canonicalize.f64(double undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction:   %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction:   %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction:   %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction:   %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction:   %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+;
 ; ALL-SIZE-LABEL: 'canonicalize_f64'
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
@@ -245,6 +297,16 @@ define void @canonicalize_f64() {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX1250-SIZE-LABEL: 'canonicalize_f64'
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.canonicalize.f64(double undef) #1
   %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1
@@ -255,9 +317,3 @@ define void @canonicalize_f64() {
   %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1
   ret void
 }
-
-
-
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 371e460d9638e..b5d7ddf5aee89 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -7,10 +7,14 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
+<<<<<<< HEAD
 ; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
 
 ; FIXME: real-true16 version of gfx1250 test fails
+=======
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
+>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
@@ -10908,13 +10912,12 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX1250-LABEL: v_fadd_v2bf16:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_pk_add_bf16 v0, v0, v1
-; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   v_pk_add_bf16 v0, v0, v1
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fadd <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
 }
@@ -11447,14 +11450,13 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX1250-LABEL: v_fadd_v4bf16:
 ; GFX1250:       ; %bb.0:
-; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:    s_wait_kmcnt 0x0
-; GFX1250-NEXT:    v_pk_add_bf16 v0, v0, v2
-; GFX1250-NEXT:    v_pk_add_bf16 v1, v1, v3
-; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   v_pk_add_bf16 v0, v0, v2
+; GFX1250-NEXT:   v_pk_add_bf16 v1, v1, v3
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fadd <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
 }
@@ -49361,7 +49363,10 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+<<<<<<< HEAD
 ;
+=======
+>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
 ; GFX1250-LABEL: v_fma_v2bf16:
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -49978,7 +49983,10 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+<<<<<<< HEAD
 ;
+=======
+>>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
 ; GFX1250-LABEL: v_fma_v4bf16:
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0

From 3f99c0c409dc42244321f62833a97c573263d948 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini <giuseppe.rossini@amd.com>
Date: Fri, 19 Sep 2025 09:38:09 +0100
Subject: [PATCH 2/4] Add testing coverage - part I

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   11 +-
 llvm/test/Analysis/CostModel/AMDGPU/fadd.ll   |   31 +
 llvm/test/Analysis/CostModel/AMDGPU/fma.ll    |   20 +
 llvm/test/Analysis/CostModel/AMDGPU/fmul.ll   |   21 +
 llvm/test/Analysis/CostModel/AMDGPU/fsub.ll   |   33 +
 llvm/test/CodeGen/AMDGPU/bf16.ll              |   10 +
 .../test/CodeGen/AMDGPU/fcanonicalize.bf16.ll | 1292 +++++++++++++++++
 7 files changed, 1414 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6f761bc1dc11e..77f0c55100981 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6628,10 +6628,12 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
-         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
-         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
-         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
+         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
+         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
+         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+         VT == MVT::v32bf16);
 
   auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
 
@@ -6686,6 +6688,7 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                                SelectionDAG &DAG) const {
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
+  VT.dump();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
          VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index 55994d865fa6c..d9729479f7410 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -2,9 +2,11 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s
 ; END.
 
 define amdgpu_kernel void @fadd_f32() #0 {
@@ -158,4 +160,33 @@ define amdgpu_kernel void @fadd_f16() #0 {
   ret void
 }
 
+define amdgpu_kernel void @fadd_bf16() #0 {
+; GFX1250-LABEL: 'fadd_bf16'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %bf16 = fadd bfloat undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+; GFX1250-SIZE-LABEL: 'fadd_bf16'
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
+  %bf16 = fadd bfloat undef, undef
+  %v2bf16 = fadd <2 x bfloat> undef, undef
+  %v3bf16 = fadd <3 x bfloat> undef, undef
+  %v4bf16 = fadd <4 x bfloat> undef, undef
+  %v5bf16 = fadd <5 x bfloat> undef, undef
+  %v16bf16 = fadd <16 x bfloat> undef, undef
+  %v17bf16 = fadd <17 x bfloat> undef, undef
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index 2ff9d4f7f5e38..db2170af2c801 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -2,10 +2,12 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
 
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s
 
 
 define void @fma_f16() {
@@ -100,6 +102,24 @@ define void @fma_bf16() {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+; GFX1250-LABEL: 'fma_bf16'
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX1250-SIZE-LABEL: 'fma_bf16'
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
   %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
   %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
   %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index adc4eea309a58..5315852d3225c 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -2,9 +2,11 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,GFX90A-FASTF64 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,F32,FASTF64 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=F32,SLOW %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9-SIZE,GFX90A-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa  -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s
 ; END.
 
 define amdgpu_kernel void @fmul_f32() #0 {
@@ -179,6 +181,16 @@ define amdgpu_kernel void @fmul_bf16() #0 {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'fmul_bf16'
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
 ; GFX9-SIZE-LABEL: 'fmul_bf16'
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
@@ -199,6 +211,15 @@ define amdgpu_kernel void @fmul_bf16() #0 {
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+; GFX1250-SIZE-LABEL: 'fmul_bf16'
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
   %bf16 = fmul bfloat undef, undef
   %v2bf16 = fmul <2 x bfloat> undef, undef
   %v3bf16 = fmul <3 x bfloat> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 4e71a71326bad..61929a64244d6 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -2,9 +2,11 @@
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s
 ; END.
 
 define amdgpu_kernel void @fsub_f32() #0 {
@@ -157,3 +159,34 @@ define amdgpu_kernel void @fsub_f16() #0 {
   %v17f16 = fsub <17 x half> undef, undef
   ret void
 }
+
+define amdgpu_kernel void @fsub_bf16() #0 {
+; GFX1250-LABEL: 'fsub_bf16'
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fsub bfloat undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX1250-SIZE-LABEL: 'fsub_bf16'
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = fsub bfloat undef, undef
+  %v2bf16 = fsub <2 x bfloat> undef, undef
+  %v3bf16 = fsub <3 x bfloat> undef, undef
+  %v4bf16 = fsub <4 x bfloat> undef, undef
+  %v5bf16 = fsub <5 x bfloat> undef, undef
+  %v16bf16 = fsub <16 x bfloat> undef, undef
+  %v17bf16 = fsub <17 x bfloat> undef, undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
new file mode 100644
index 0000000000000..a4cdb0387df9a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
@@ -0,0 +1,1292 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250  < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s
+
+declare bfloat @llvm.fabs.bf16(bfloat) #0
+declare bfloat @llvm.canonicalize.bf16(bfloat) #0
+declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0
+declare <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat>) #0
+declare <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat>) #0
+declare <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat>) #0
+declare <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat>) #0
+declare <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat>) #0
+declare <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat>) #0
+declare <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat>) #0
+declare <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat>) #0
+declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+; GFX1250-LABEL:     test_fold_canonicalize_undef_value_bf16:
+; GFX1250:           ; %bb.0:
+; GFX1250-NEXT:        s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:        v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:        s_wait_kmcnt 0x0
+; GFX1250-NEXT:        global_store_b16 v0, v0, s[0:1]
+; GFX1250-NEXT:        s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX1250-LABEL:    v_test_canonicalize_var_bf16:
+; GFX1250:          ; %bb.0:
+; GFX1250-NEXT:       s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:       v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:       s_wait_kmcnt 0x0
+; GFX1250-NEXT:       global_load_u16 v0, v0, s[0:1]
+; GFX1250-NEXT:       s_wait_loadcnt 0x0
+; GFX1250-NEXT:       v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:       s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:       v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT:       v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:       global_store_b16 v[0:1], v0, off
+; GFX1250-NEXT:       s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 {
+  %val = load bfloat, ptr addrspace(1) %out
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
+  store bfloat %canonicalized, ptr addrspace(1) poison
+  ret void
+}
+
+; GFX1250-LABEL:     s_test_canonicalize_var_bf16:
+; GFX1250:           ; %bb.0:
+; GFX1250-NEXT:        s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT:        v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:        s_wait_kmcnt 0x0
+; GFX1250-NEXT:        s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT:        s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:        v_max_num_f32_e64 v0, s2, s2
+; GFX1250-NEXT:        v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:        global_store_b16 v1, v0, s[0:1]
+; GFX1250-NEXT:        s_endpgm
+define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
+  %val = bitcast i16 %val.arg to bfloat
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX1250-LABEL:    v_test_canonicalize_build_vector_v2bf16:
+; GFX1250:          ; %bb.0:
+; GFX1250-NEXT:       s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:       s_wait_kmcnt 0x0
+; GFX1250-NEXT:       v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-NEXT:       s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:       v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; GFX1250-NEXT:       v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:       s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 {
+  %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0
+  %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1)
+  ret <2 x bfloat> %canonicalized
+}
+
+
+; GFX1250-LABEL:     v_test_canonicalize_fabs_var_bf16:
+; GFX1250:           ; %bb.0:
+; GFX1250-NEXT:         s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:         v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:         s_wait_kmcnt 0x0
+; GFX1250-NEXT:         global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:         s_wait_loadcnt 0x0
+; GFX1250-NEXT:         v_and_b32_e32 v1, 0x7fff, v1
+; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:         v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:         v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:         v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:         global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:         s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 {
+  %val = load bfloat, ptr addrspace(1) %out
+  %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val)
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+
+; GFX1250-LABEL:     v_test_canonicalize_fneg_fabs_var_bf16:
+; GFX1250:           ; %bb.0:
+; GFX1250-NEXT:        s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:        v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:        s_wait_kmcnt 0x0
+; GFX1250-NEXT:        global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:        s_wait_loadcnt 0x0
+; GFX1250-NEXT:        v_or_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:        v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:        v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:        global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:        s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 {
+  %val = load bfloat, ptr addrspace(1) %out
+  %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val)
+  %val.fabs.fneg = fneg bfloat %val.fabs
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX1250-LABEL:    v_test_canonicalize_fneg_var_bf16:
+; GFX1250:          ; %bb.0:
+; GFX1250-NEXT:        s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:        v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:        s_wait_kmcnt 0x0
+; GFX1250-NEXT:        global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:        s_wait_loadcnt 0x0
+; GFX1250-NEXT:        v_xor_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:        v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:        v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:        global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:        s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 {
+  %val = load bfloat, ptr addrspace(1) %out
+  %val.fneg = fneg bfloat %val
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX1250-LABEL:      v_test_no_denormals_canonicalize_fneg_var_bf16:
+; GFX1250:            ; %bb.0:
+; GFX1250-NEXT:         s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:         v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:         s_wait_kmcnt 0x0
+; GFX1250-NEXT:         global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:         s_wait_loadcnt 0x0
+; GFX1250-NEXT:         v_xor_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:         v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:         v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:         v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:         global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:         s_endpgm
+define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 {
+  %val = load bfloat, ptr addrspace(1) %out
+  %val.fneg = fneg bfloat %val
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX1250-LABEL:      v_test_no_denormals_canonicalize_fneg_fabs_var_bf16:
+; GFX1250:            ; %bb.0:
+; GFX1250-NEXT:         s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:         v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:         s_wait_kmcnt 0x0
+; GFX1250-NEXT:         global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:         s_wait_loadcnt 0x0
+; GFX1250-NEXT:         v_or_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:         v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:         v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:         v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:         global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:         s_endpgm
+define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 {
+  %val = load bfloat, ptr addrspace(1) %out
+  %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val)
+  %val.fabs.fneg = fneg bfloat %val.fabs
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+
+; GFX1250-LABEL: test_fold_canonicalize_p0_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_n0_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+;
+define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_p1_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_n1_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+;
+define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_literal_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C00)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -1 to bfloat))
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -2 to bfloat))
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C01)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7DFF)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFDFF)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 {
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFC01)
+  store bfloat %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: 	v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_load_b32 v0, v0, s[0:1] scale_offset
+; GFX1250-NEXT: 	s_wait_loadcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v1, 0xffff0000, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-NEXT: 	v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: 	global_store_b32 v2, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid
+  %val = load <2 x bfloat>, ptr addrspace(1) %gep
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val)
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: 	v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_load_b32 v0, v0, s[0:1] scale_offset
+; GFX1250-NEXT: 	s_wait_loadcnt 0x0
+; GFX1250-NEXT: 	v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: 	v_and_b32_e32 v0, 0x7fff, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	v_and_b32_e32 v1, 0x7fff, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: 	global_store_b32 v2, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid
+  %val = load <2 x bfloat>, ptr addrspace(1) %gep
+  %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val)
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs)
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: 	v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_load_b32 v0, v0, s[0:1] scale_offset
+; GFX1250-NEXT: 	s_wait_loadcnt 0x0
+; GFX1250-NEXT: 	v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: 	v_or_b32_e32 v0, 0x8000, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	v_or_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: 	global_store_b32 v2, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid
+  %val = load <2 x bfloat>, ptr addrspace(1) %gep
+  %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val)
+  %val.fabs.fneg = fneg <2 x bfloat> %val.fabs
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs.fneg)
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: 	v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_load_b32 v0, v0, s[0:1] scale_offset
+; GFX1250-NEXT: 	s_wait_loadcnt 0x0
+; GFX1250-NEXT: 	v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-NEXT: 	v_xor_b32_e32 v0, 0x8000, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	v_xor_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: 	global_store_b32 v2, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { ; canonicalize(fneg(x)) on a <2 x bfloat> loaded per work-item
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid
+  %val = load <2 x bfloat>, ptr addrspace(1) %gep
+  %fneg.val = fneg <2 x bfloat> %val ; appears in the expected code as v_xor_b32 with 0x8000 on each lane
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %fneg.val) ; expected code shifts each lane left by 16, applies v_max_num_f32 x, x and repacks with v_cvt_pk_bf16_f32
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ; stored through %out itself, not through %gep
+  ret void
+}
+; GFX1250-LABEL: s_test_canonicalize_var_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT: 	v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	s_and_b32 s3, s2, 0xffff0000
+; GFX1250-NEXT: 	s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT: 	v_max_num_f32_e64 v0, s3, s3
+; GFX1250-NEXT: 	v_max_num_f32_e64 v1, s2, s2
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v1, v0
+; GFX1250-NEXT: 	global_store_b32 v2, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { ; canonicalize of a <2 x bfloat> that arrives as an i32 kernel argument
+  %val = bitcast i32 %val.arg to <2 x bfloat> ; expected code splits the lanes with scalar ops (s_and_b32 0xffff0000 / s_lshl_b32 16)
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) ; expected code applies v_max_num_f32 x, x per lane, then v_cvt_pk_bf16_f32
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold: canonicalize of an all-zero <2 x bfloat>
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> zeroinitializer) ; expected code has no FP instruction and stores the value 0
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold: canonicalize of a splat of -0.0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -0.0, bfloat -0.0>) ; expected code stores the constant 0x80008000 (sign bit kept in both lanes)
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold: canonicalize of a splat of 1.0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 1.0, bfloat 1.0>) ; expected code stores the constant 0x3f803f80
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold: canonicalize of a splat of -1.0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -1.0, bfloat -1.0>) ; expected code stores the constant 0xbf80bf80
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold: canonicalize of a splat of 16.0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 16.0, bfloat 16.0>) ; expected code stores the constant 0x41804180
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0x03FF under attribute group #1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) ; NOTE(review): 0x03FF is a denormal for half, but as bfloat it appears to be a normal value (exponent field 7); the expected constant 0x3ff03ff is the input unchanged - confirm whether a bfloat denormal was intended
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 { ; constant fold of a splat of bit pattern 0x03FF under attribute group #3
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) ; NOTE(review): 0x03FF is a denormal for half, but as bfloat it appears to be a normal value (exponent field 7); the expected constant 0x3ff03ff is the input unchanged - confirm whether a bfloat denormal was intended
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0x83FF under attribute group #1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) ; NOTE(review): 0x83FF is a negative denormal for half, but as bfloat it appears to be a normal value; the expected constant 0x83ff83ff is the input unchanged - confirm whether a bfloat denormal was intended
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 { ; constant fold of a splat of bit pattern 0x83FF under attribute group #3
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) ; NOTE(review): 0x83FF is a negative denormal for half, but as bfloat it appears to be a normal value; the expected constant 0x83ff83ff is the input unchanged - confirm whether a bfloat denormal was intended
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0x7C00
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C00, bfloat 0xR7C00>) ; NOTE(review): despite the test name, 0x7C00 does not look like a bfloat NaN (exponent field is not all ones); the expected constant 0x7c007c00 is the input unchanged - confirm intended pattern
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of an all-ones vector (i32 -1 reinterpreted as <2 x bfloat>)
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> bitcast (i32 -1 to <2 x bfloat>)) ; expected code stores 0x7fc07fc0, i.e. each 0xFFFF lane is replaced by 0x7fc0
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of i16 -2 reinterpreted as bfloat
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat bitcast (i16 -2 to bfloat), bfloat bitcast (i16 -2 to bfloat)>) ; expected code stores 0x7fc07fc0, i.e. each lane is replaced by 0x7fc0
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0x7C01
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C01, bfloat 0xR7C01>) ; NOTE(review): 0x7C01 is a signaling NaN for half but does not look like a bfloat NaN; the expected constant 0x7c017c01 is the input unchanged, so no NaN quieting is exercised - confirm intended pattern
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0x7DFF
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7DFF, bfloat 0xR7DFF>) ; NOTE(review): 0x7DFF is a signaling NaN for half but does not look like a bfloat NaN; the expected constant 0x7dff7dff is the input unchanged, so no NaN quieting is exercised - confirm intended pattern
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0xFDFF
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFDFF, bfloat 0xRFDFF>) ; NOTE(review): 0xFDFF is a signaling NaN for half but does not look like a bfloat NaN; the expected constant 0xfdfffdff is the input unchanged, so no NaN quieting is exercised - confirm intended pattern
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 { ; constant fold of a splat of bit pattern 0xFC01
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFC01, bfloat 0xRFC01>) ; NOTE(review): 0xFC01 is a signaling NaN for half but does not look like a bfloat NaN; the expected constant 0xfc01fc01 is the input unchanged, so no NaN quieting is exercised - confirm intended pattern
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v3bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v2, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0
+; GFX1250-NEXT: 	v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v2
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <3 x bfloat> @v_test_canonicalize_var_v3bf16(<3 x bfloat> %val) #1 { ; canonicalize of a <3 x bfloat> function argument (odd element count)
+  %canonicalized = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> %val) ; expected code does three v_max_num_f32 x, x; the second v_cvt_pk_bf16_f32 takes s0 as its other operand, presumably the unused fourth lane
+  ret <3 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1250-NEXT: 	v_and_b32_e32 v3, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v3
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v2
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 { ; canonicalize of a <4 x bfloat> function argument
+  %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %val) ; expected code does four v_max_num_f32 x, x (dual-issued) and two v_cvt_pk_bf16_f32 repacks
+  ret <4 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b32 v0, v0, s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 { ; canonicalize of an undef <2 x bfloat>
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) ; expected code has no FP instruction and stores the value 0
+  store <2 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: 	s_movk_i32 s0, 0x7fc0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: 	v_perm_b32 v0, s0, v0, 0x5040100
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_reg_undef_v2bf16(bfloat %val) #1 { ; canonicalize of a vector whose element 0 is %val and element 1 is poison
+  %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ; expected code runs one v_max_num_f32 x, x on %val, then v_perm_b32 combines it with the constant 0x7fc0; s0 is read by the cvt before s_movk_i32 sets it
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: 	s_movk_i32 s0, 0x7fc0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: 	v_perm_b32 v0, v0, s0, 0x5040100
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_undef_reg_v2bf16(bfloat %val) #1 { ; canonicalize of a vector whose element 0 is poison and element 1 is %val
+  %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ; expected code runs one v_max_num_f32 x, x on %val, then v_perm_b32 combines it with the constant 0x7fc0 (operands swapped relative to the reg_undef test)
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 1.0
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_undef_lo_imm_hi_v2bf16() #1 { ; constant fold: element 0 undef, element 1 is 1.0
+  %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ; expected result is a single v_mov_b32 of the operand 1.0
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0x3f80
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_imm_lo_undef_hi_v2bf16() #1 { ; constant fold: element 0 is 1.0, element 1 undef
+  %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ; expected result is a single v_mov_b32 of 0x3f80 (upper 16 bits zero)
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_undef_lo_k_hi_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0x41800000
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_undef_lo_k_hi_v2bf16() #1 { ; constant fold: element 0 undef, element 1 is 16.0
+  %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ; expected result is a single v_mov_b32 of 0x41800000 (lower 16 bits zero)
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_k_lo_undef_hi_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0x4180
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_k_lo_undef_hi_v2bf16() #1 { ; constant fold: element 0 is 16.0, element 1 undef
+  %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 0
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ; expected result is a single v_mov_b32 of 0x4180 (upper 16 bits zero)
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: 	s_movk_i32 s0, 0x4000
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: 	v_perm_b32 v0, s0, v0, 0x5040100
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { ; canonicalize of a vector whose element 0 is %val and element 1 is the constant 2.0
+  %vec0 = insertelement <2 x bfloat> poison, bfloat %val, i32 0
+  %vec1 = insertelement <2 x bfloat> %vec0, bfloat 2.0, i32 1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) ; expected code runs one v_max_num_f32 x, x on %val only; the constant lane is materialized as 0x4000 and merged with v_perm_b32
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: 	s_movk_i32 s0, 0x4000
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: 	v_perm_b32 v0, v0, s0, 0x5040100
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { ; canonicalize of a vector whose element 0 is the constant 2.0 and element 1 is %val
+  %vec0 = insertelement <2 x bfloat> poison, bfloat 2.0, i32 0
+  %vec1 = insertelement <2 x bfloat> %vec0, bfloat %val, i32 1
+  %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) ; expected code runs one v_max_num_f32 x, x on %val only; the constant lane is materialized as 0x4000 and merged with v_perm_b32 (operands swapped relative to the reg_k test)
+  ret <2 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	global_store_b64 v0, v[0:1], s[0:1]
+; GFX1250-NEXT: 	s_endpgm
+define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 { ; canonicalize of an undef <4 x bfloat>
+  %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) ; expected code has no FP instruction and stores 64 bits of zero
+  store <4 x bfloat> %canonicalized, ptr addrspace(1) %out
+  ret void
+}
+; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: 	s_movk_i32 s0, 0x7fc0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: 	v_perm_b32 v0, s0, v0, 0x5040100
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <4 x bfloat> @v_test_canonicalize_reg_undef_undef_undef_v4bf16(bfloat %val) #1 { ; canonicalize of a <4 x bfloat> with only element 0 defined (rest poison)
+  %vec = insertelement <4 x bfloat> poison, bfloat %val, i32 0
+  %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec) ; expected code canonicalizes %val with one v_max_num_f32 x, x; the second result register is the constant 0x7fc07fc0 and 0x7fc0 is merged into the first via v_perm_b32
+  ret <4 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: 	v_mov_b32_e32 v1, 0x7fc07fc0
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0, bfloat %val1) #1 { ; canonicalize of a <4 x bfloat> with elements 0 and 1 defined, 2 and 3 poison
+  %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0
+  %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 1
+  %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec1) ; expected code canonicalizes the two defined lanes into v0 and sets v1 to the constant 0x7fc07fc0
+  ret <4 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: 	s_movk_i32 s0, 0x7fc0
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v2
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: 	v_perm_b32 v0, s0, v0, 0x5040100
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <4 x bfloat> @v_test_canonicalize_reg_undef_reg_reg_v4bf16(bfloat %val0, bfloat %val1, bfloat %val2) #1 { ; canonicalize of a <4 x bfloat> with elements 0, 2 and 3 defined and element 1 poison
+  %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0
+  %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 2 ; note: %val1 goes to index 2, %val2 to index 3; index 1 is never written
+  %vec2 = insertelement <4 x bfloat> %vec1, bfloat %val2, i32 3
+  %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec2) ; expected code does three v_max_num_f32 x, x and merges 0x7fc0 into the first result register via v_perm_b32
+  ret <4 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v6bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v3, 0xffff0000, v2
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-NEXT: 	v_and_b32_e32 v4, 0xffff0000, v1
+; GFX1250-NEXT: 	v_and_b32_e32 v5, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4
+; GFX1250-NEXT: 	v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v0, v0, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v5
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v4
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v2, v2, v3
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <6 x bfloat> @v_test_canonicalize_var_v6bf16(<6 x bfloat> %val) #1 { ; canonicalize of a <6 x bfloat> function argument
+  %canonicalized = call <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat> %val) ; expected code does six v_max_num_f32 x, x (dual-issued) and three v_cvt_pk_bf16_f32 repacks
+  ret <6 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v8bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v5, 0xffff0000, v2
+; GFX1250-NEXT: 	v_and_b32_e32 v4, 0xffff0000, v3
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v3, 16, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v6, 0xffff0000, v1
+; GFX1250-NEXT: 	v_and_b32_e32 v7, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_max_num_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v6, v6, v6
+; GFX1250-NEXT: 	v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v0, v0, v0
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2
+; GFX1250-NEXT: 	v_max_num_f32_e32 v3, v3, v3
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v7
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v2, v2, v5
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v3, v3, v4
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <8 x bfloat> @v_test_canonicalize_var_v8bf16(<8 x bfloat> %val) #1 { ; canonicalize of an <8 x bfloat> function argument
+  %canonicalized = call <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat> %val) ; expected code does eight v_max_num_f32 x, x and four v_cvt_pk_bf16_f32 repacks
+  ret <8 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v12bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v6, 0xffff0000, v5
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-NEXT: 	v_and_b32_e32 v7, 0xffff0000, v4
+; GFX1250-NEXT: 	v_and_b32_e32 v8, 0xffff0000, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v9, 0xffff0000, v2
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v10, 0xffff0000, v1
+; GFX1250-NEXT: 	v_and_b32_e32 v11, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-NEXT: 	v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v5, v5, v5
+; GFX1250-NEXT: 	v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v8, v8, v8
+; GFX1250-NEXT: 	v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v10, v10, v10
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v11, v11, v11
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2
+; GFX1250-NEXT: 	v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v11
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v10
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v2, v2, v9
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v3, v3, v8
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v4, v4, v7
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v5, v5, v6
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <12 x bfloat> @v_test_canonicalize_var_v12bf16(<12 x bfloat> %val) #1 { ; canonicalize of a <12 x bfloat> function argument
+  %canonicalized = call <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat> %val) ; expected code does twelve v_max_num_f32 x, x (dual-issued) and six v_cvt_pk_bf16_f32 repacks
+  ret <12 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v16bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v8, 0xffff0000, v7
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v7, 16, v7
+; GFX1250-NEXT: 	v_and_b32_e32 v9, 0xffff0000, v6
+; GFX1250-NEXT: 	v_and_b32_e32 v10, 0xffff0000, v5
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: 	v_dual_max_num_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX1250-NEXT: 	v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX1250-NEXT: 	v_and_b32_e32 v11, 0xffff0000, v4
+; GFX1250-NEXT: 	v_and_b32_e32 v12, 0xffff0000, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v13, 0xffff0000, v2
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v14, 0xffff0000, v1
+; GFX1250-NEXT: 	v_and_b32_e32 v15, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX1250-NEXT: 	v_max_num_f32_e32 v6, v6, v6
+; GFX1250-NEXT: 	v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v5, v5, v5
+; GFX1250-NEXT: 	v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v12, v12, v12
+; GFX1250-NEXT: 	v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v14, v14, v14
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v15, v15, v15
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2
+; GFX1250-NEXT: 	v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v15
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v14
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v2, v2, v13
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v3, v3, v12
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v4, v4, v11
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v5, v5, v10
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v6, v6, v9
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v7, v7, v8
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <16 x bfloat> @v_test_canonicalize_var_v16bf16(<16 x bfloat> %val) #1 { ; canonicalize of a <16 x bfloat> function argument
+  %canonicalized = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> %val) ; expected code does sixteen v_max_num_f32 x, x and eight v_cvt_pk_bf16_f32 repacks
+  ret <16 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v32bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v16, 0xffff0000, v15
+; GFX1250-NEXT: 	v_and_b32_e32 v18, 0xffff0000, v13
+; GFX1250-NEXT: 	v_and_b32_e32 v20, 0xffff0000, v11
+; GFX1250-NEXT: 	v_and_b32_e32 v22, 0xffff0000, v9
+; GFX1250-NEXT: 	v_and_b32_e32 v24, 0xffff0000, v7
+; GFX1250-NEXT: 	v_dual_max_num_f32 v16, v16, v16 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX1250-NEXT: 	v_and_b32_e32 v17, 0xffff0000, v14
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v14, 16, v14 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX1250-NEXT: 	v_max_num_f32_e32 v18, v18, v18
+; GFX1250-NEXT: 	v_and_b32_e32 v19, 0xffff0000, v12
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v12, 16, v12 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX1250-NEXT: 	v_max_num_f32_e32 v20, v20, v20
+; GFX1250-NEXT: 	v_and_b32_e32 v21, 0xffff0000, v10
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v10, 16, v10 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX1250-NEXT: 	v_max_num_f32_e32 v22, v22, v22
+; GFX1250-NEXT: 	v_and_b32_e32 v23, 0xffff0000, v8
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v8, 16, v8 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX1250-NEXT: 	v_max_num_f32_e32 v24, v24, v24
+; GFX1250-NEXT: 	v_and_b32_e32 v25, 0xffff0000, v6
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v6, 16, v6
+; GFX1250-NEXT: 	v_and_b32_e32 v26, 0xffff0000, v5
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-NEXT: 	v_and_b32_e32 v27, 0xffff0000, v4
+; GFX1250-NEXT: 	v_and_b32_e32 v28, 0xffff0000, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v29, 0xffff0000, v2
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX1250-NEXT: 	v_and_b32_e32 v30, 0xffff0000, v1
+; GFX1250-NEXT: 	v_and_b32_e32 v31, 0xffff0000, v0
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1250-NEXT: 	v_dual_max_num_f32 v15, v15, v15 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX1250-NEXT: 	v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v14, v14, v14
+; GFX1250-NEXT: 	v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v19, v19, v19
+; GFX1250-NEXT: 	v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v11, v11, v11
+; GFX1250-NEXT: 	v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v10, v10, v10
+; GFX1250-NEXT: 	v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v23, v23, v23
+; GFX1250-NEXT: 	v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v7, v7, v7
+; GFX1250-NEXT: 	v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v6, v6, v6
+; GFX1250-NEXT: 	v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v5, v5, v5
+; GFX1250-NEXT: 	v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v28, v28, v28
+; GFX1250-NEXT: 	v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v30, v30, v30
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v31, v31, v31
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2
+; GFX1250-NEXT: 	v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v31
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v30
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v2, v2, v29
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v3, v3, v28
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v4, v4, v27
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v5, v5, v26
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v6, v6, v25
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v7, v7, v24
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v8, v8, v23
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v9, v9, v22
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v10, v10, v21
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v11, v11, v20
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v12, v12, v19
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v13, v13, v18
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v14, v14, v17
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v15, v15, v16
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <32 x bfloat> @v_test_canonicalize_var_v32bf16(<32 x bfloat> %val) #1 { ; test input: canonicalize a <32 x bfloat> argument and return it
+  %canonicalized = call <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat> %val) ; attribute group #1 is nounwind with denormal-fp-math-f32 set to preserve-sign
+  ret <32 x bfloat> %canonicalized
+}
+; GFX1250-LABEL: v_test_canonicalize_var_v64bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: 	s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: 	s_wait_kmcnt 0x0
+; GFX1250-NEXT: 	scratch_load_b32 v31, off, s32
+; GFX1250-NEXT: 	v_and_b32_e32 v81, 0xffff0000, v0
+; GFX1250-NEXT: 	v_and_b32_e32 v38, 0xffff0000, v24
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v24, 16, v24
+; GFX1250-NEXT: 	v_and_b32_e32 v39, 0xffff0000, v23
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v23, 16, v23
+; GFX1250-NEXT: 	v_and_b32_e32 v80, 0xffff0000, v6
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX1250-NEXT: 	v_and_b32_e32 v82, 0xffff0000, v1
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT: 	v_max_num_f32_e32 v81, v81, v81
+; GFX1250-NEXT: 	v_and_b32_e32 v83, 0xffff0000, v2
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-NEXT: 	v_and_b32_e32 v34, 0xffff0000, v28
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v28, 16, v28
+; GFX1250-NEXT: 	v_and_b32_e32 v35, 0xffff0000, v27
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v27, 16, v27
+; GFX1250-NEXT: 	v_and_b32_e32 v36, 0xffff0000, v26
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v26, 16, v26
+; GFX1250-NEXT: 	v_and_b32_e32 v48, 0xffff0000, v22
+; GFX1250-NEXT: 	v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v82, v82, v82
+; GFX1250-NEXT: 	v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v83, v83, v83
+; GFX1250-NEXT: 	v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v24, v24, v24
+; GFX1250-NEXT: 	v_max_num_f32_e32 v39, v39, v39
+; GFX1250-NEXT: 	v_dual_max_num_f32 v23, v23, v23 :: v_dual_max_num_f32 v48, v48, v48
+; GFX1250-NEXT: 	v_and_b32_e32 v32, 0xffff0000, v30
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v30, 16, v30
+; GFX1250-NEXT: 	v_and_b32_e32 v33, 0xffff0000, v29
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v29, 16, v29
+; GFX1250-NEXT: 	v_and_b32_e32 v37, 0xffff0000, v25
+; GFX1250-NEXT: 	v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX1250-NEXT: 	v_and_b32_e32 v49, 0xffff0000, v21
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v21, 16, v21
+; GFX1250-NEXT: 	v_and_b32_e32 v50, 0xffff0000, v20
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v20, 16, v20
+; GFX1250-NEXT: 	v_and_b32_e32 v51, 0xffff0000, v19
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v19, 16, v19
+; GFX1250-NEXT: 	v_and_b32_e32 v52, 0xffff0000, v18
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v18, 16, v18
+; GFX1250-NEXT: 	v_and_b32_e32 v53, 0xffff0000, v17
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v17, 16, v17
+; GFX1250-NEXT: 	v_and_b32_e32 v54, 0xffff0000, v16
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v16, 16, v16
+; GFX1250-NEXT: 	v_and_b32_e32 v55, 0xffff0000, v15
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v15, 16, v15
+; GFX1250-NEXT: 	v_and_b32_e32 v64, 0xffff0000, v14
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v14, 16, v14
+; GFX1250-NEXT: 	v_and_b32_e32 v65, 0xffff0000, v13
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v13, 16, v13
+; GFX1250-NEXT: 	v_and_b32_e32 v66, 0xffff0000, v12
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v12, 16, v12
+; GFX1250-NEXT: 	v_and_b32_e32 v67, 0xffff0000, v11
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v11, 16, v11
+; GFX1250-NEXT: 	v_and_b32_e32 v68, 0xffff0000, v10
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v10, 16, v10
+; GFX1250-NEXT: 	v_and_b32_e32 v69, 0xffff0000, v9
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v9, 16, v9
+; GFX1250-NEXT: 	v_and_b32_e32 v70, 0xffff0000, v8
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v8, 16, v8
+; GFX1250-NEXT: 	v_and_b32_e32 v71, 0xffff0000, v7
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v7, 16, v7
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v0, v0, v81
+; GFX1250-NEXT: 	v_and_b32_e32 v81, 0xffff0000, v5
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v1, v1, v82
+; GFX1250-NEXT: 	v_and_b32_e32 v82, 0xffff0000, v4
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v4, 16, v4
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v2, v2, v83
+; GFX1250-NEXT: 	v_and_b32_e32 v83, 0xffff0000, v3
+; GFX1250-NEXT: 	v_dual_max_num_f32 v32, v32, v32 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX1250-NEXT: 	v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v36, v36, v36
+; GFX1250-NEXT: 	v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v37, v37, v37
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v23, v23, v39
+; GFX1250-NEXT: 	v_dual_max_num_f32 v30, v30, v30 :: v_dual_max_num_f32 v33, v33, v33
+; GFX1250-NEXT: 	v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v34, v34, v34
+; GFX1250-NEXT: 	v_dual_max_num_f32 v28, v28, v28 :: v_dual_max_num_f32 v35, v35, v35
+; GFX1250-NEXT: 	v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v38, v38, v38
+; GFX1250-NEXT: 	v_dual_max_num_f32 v22, v22, v22 :: v_dual_max_num_f32 v49, v49, v49
+; GFX1250-NEXT: 	v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v50, v50, v50
+; GFX1250-NEXT: 	v_dual_max_num_f32 v20, v20, v20 :: v_dual_max_num_f32 v51, v51, v51
+; GFX1250-NEXT: 	v_dual_max_num_f32 v19, v19, v19 :: v_dual_max_num_f32 v52, v52, v52
+; GFX1250-NEXT: 	v_dual_max_num_f32 v18, v18, v18 :: v_dual_max_num_f32 v53, v53, v53
+; GFX1250-NEXT: 	v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v54, v54, v54
+; GFX1250-NEXT: 	v_dual_max_num_f32 v16, v16, v16 :: v_dual_max_num_f32 v55, v55, v55
+; GFX1250-NEXT: 	v_dual_max_num_f32 v15, v15, v15 :: v_dual_max_num_f32 v64, v64, v64
+; GFX1250-NEXT: 	v_dual_max_num_f32 v14, v14, v14 :: v_dual_max_num_f32 v65, v65, v65
+; GFX1250-NEXT: 	v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v66, v66, v66
+; GFX1250-NEXT: 	v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v67, v67, v67
+; GFX1250-NEXT: 	v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v68, v68, v68
+; GFX1250-NEXT: 	v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v69, v69, v69
+; GFX1250-NEXT: 	v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v70, v70, v70
+; GFX1250-NEXT: 	v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v71, v71, v71
+; GFX1250-NEXT: 	v_dual_max_num_f32 v80, v80, v80 :: v_dual_max_num_f32 v81, v81, v81
+; GFX1250-NEXT: 	v_dual_max_num_f32 v82, v82, v82 :: v_dual_max_num_f32 v83, v83, v83
+; GFX1250-NEXT: 	v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4
+; GFX1250-NEXT: 	v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v6, v6, v6
+; GFX1250-NEXT: 	v_max_num_f32_e32 v7, v7, v7
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v26, v26, v36
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v3, v3, v83
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v4, v4, v82
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v5, v5, v81
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v6, v6, v80
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v7, v7, v71
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v8, v8, v70
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v9, v9, v69
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v10, v10, v68
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v11, v11, v67
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v12, v12, v66
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v13, v13, v65
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v14, v14, v64
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v15, v15, v55
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v16, v16, v54
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v17, v17, v53
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v18, v18, v52
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v19, v19, v51
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v20, v20, v50
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v21, v21, v49
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v22, v22, v48
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v24, v24, v38
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v25, v25, v37
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v27, v27, v35
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v28, v28, v34
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v29, v29, v33
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v30, v30, v32
+; GFX1250-NEXT: 	s_wait_loadcnt 0x0
+; GFX1250-NEXT: 	v_and_b32_e32 v39, 0xffff0000, v31
+; GFX1250-NEXT: 	v_lshlrev_b32_e32 v31, 16, v31
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: 	v_max_num_f32_e32 v36, v39, v39
+; GFX1250-NEXT: 	v_max_num_f32_e32 v31, v31, v31
+; GFX1250-NEXT: 	s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: 	v_cvt_pk_bf16_f32 v31, v31, v36
+; GFX1250-NEXT: 	s_set_pc_i64 s[30:31]
+define <64 x bfloat> @v_test_canonicalize_var_v64bf16(<64 x bfloat> %val) #1 { ; test input: canonicalize a <64 x bfloat> argument and return it
+  %canonicalized = call <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat> %val) ; NOTE(review): the expected asm above loads one dword from scratch at s32 into v31, presumably the part of %val that is not passed in VGPRs; confirm
+  ret <64 x bfloat> %canonicalized
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }

From 32d932f3f761c995f03d9e8f1db90bfcea99375a Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini <giuseppe.rossini@amd.com>
Date: Wed, 24 Sep 2025 00:15:40 +0100
Subject: [PATCH 3/4] Fix CostModel for bf16

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  2 +-
 llvm/test/Analysis/CostModel/AMDGPU/fadd.ll   | 22 ++++-----
 llvm/test/Analysis/CostModel/AMDGPU/fmul.ll   | 48 +++++++++----------
 llvm/test/Analysis/CostModel/AMDGPU/fsub.ll   | 26 +++++-----
 4 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b07e936c494f6..03d16fdd54c42 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -615,7 +615,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
     if (ST->has16BitInsts() && SLT == MVT::f16)
       NElts = (NElts + 1) / 2;
 
-    if (SLT == MVT::f32 || SLT == MVT::f16)
+    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
       return LT.first * NElts * getFullRateInstrCost();
     break;
   case ISD::FDIV:
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index d9729479f7410..3538c07938530 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -162,22 +162,22 @@ define amdgpu_kernel void @fadd_f16() #0 {
 
 define amdgpu_kernel void @fadd_bf16() #0 {
 ; GFX1250-LABEL: 'fadd_bf16'
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %bf16 = fadd bfloat undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
 ; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
 ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
 ; GFX1250-SIZE-LABEL: 'fadd_bf16'
 ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat undef, undef
 ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
 ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
   %bf16 = fadd bfloat undef, undef
   %v2bf16 = fadd <2 x bfloat> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 5315852d3225c..c022046fbf1df 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -162,13 +162,13 @@ define amdgpu_kernel void @fmul_f16() #0 {
 
 define amdgpu_kernel void @fmul_bf16() #0 {
 ; GFX9-LABEL: 'fmul_bf16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fmul_bf16'
@@ -182,23 +182,23 @@ define amdgpu_kernel void @fmul_bf16() #0 {
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX1250-LABEL: 'fmul_bf16'
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
 ; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
 ; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'fmul_bf16'
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fmul_bf16'
@@ -214,11 +214,11 @@ define amdgpu_kernel void @fmul_bf16() #0 {
 ; GFX1250-SIZE-LABEL: 'fmul_bf16'
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
   %bf16 = fmul bfloat undef, undef
   %v2bf16 = fmul <2 x bfloat> undef, undef
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 61929a64244d6..37f947d9c6341 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -6,7 +6,7 @@
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900  -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s
 ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s
-; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s
 ; END.
 
 define amdgpu_kernel void @fsub_f32() #0 {
@@ -162,23 +162,23 @@ define amdgpu_kernel void @fsub_f16() #0 {
 
 define amdgpu_kernel void @fsub_bf16() #0 {
 ; GFX1250-LABEL: 'fsub_bf16'
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fsub bfloat undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
 ; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX1250-SIZE-LABEL: 'fsub_bf16'
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = fsub bfloat undef, undef

From 0b3633928018ffc4a0813150d5b61de197ca0b5c Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini <giuseppe.rossini@amd.com>
Date: Thu, 25 Sep 2025 00:31:01 +0100
Subject: [PATCH 4/4] Latest fixes: drop stray VT.dump() and refresh bf16 tests

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp   |   1 -
 llvm/test/Analysis/CostModel/AMDGPU/fadd.ll | 242 ++++++++--------
 llvm/test/Analysis/CostModel/AMDGPU/fma.ll  | 240 ++++++++--------
 llvm/test/Analysis/CostModel/AMDGPU/fmul.ll | 298 ++++++++++----------
 llvm/test/Analysis/CostModel/AMDGPU/fsub.ll | 242 ++++++++--------
 llvm/test/CodeGen/AMDGPU/bf16.ll            | 115 ++++++--
 6 files changed, 606 insertions(+), 532 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 77f0c55100981..afefd01ffb3ba 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6688,7 +6688,6 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                                SelectionDAG &DAG) const {
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  VT.dump();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
          VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
          VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index 3538c07938530..9b1495b35a89d 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -11,181 +11,181 @@
 
 define amdgpu_kernel void @fadd_f32() #0 {
 ; GFX90A-FASTF64-LABEL: 'fadd_f32'
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; NOPACKEDF32-LABEL: 'fadd_f32'
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32'
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; NOPACKEDF32-SIZE-LABEL: 'fadd_f32'
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = fadd float undef, undef
-  %v2f32 = fadd <2 x float> undef, undef
-  %v3f32 = fadd <3 x float> undef, undef
-  %v4f32 = fadd <4 x float> undef, undef
-  %v5f32 = fadd <5 x float> undef, undef
-  %v8f32 = fadd <8 x float> undef, undef
-  %v9f32 = fadd <9 x float> undef, undef
+  %f32 = fadd float poison, poison
+  %v2f32 = fadd <2 x float> poison, poison
+  %v3f32 = fadd <3 x float> poison, poison
+  %v4f32 = fadd <4 x float> poison, poison
+  %v5f32 = fadd <5 x float> poison, poison
+  %v8f32 = fadd <8 x float> poison, poison
+  %v9f32 = fadd <9 x float> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fadd_f64() #0 {
 ; GFX90A-FASTF64-LABEL: 'fadd_f64'
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FASTF64-LABEL: 'fadd_f64'
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOWF64-LABEL: 'fadd_f64'
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> undef, undef
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> poison, poison
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f64'
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; NOPACKEDF32-SIZE-LABEL: 'fadd_f64'
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = fadd double undef, undef
-  %v2f64 = fadd <2 x double> undef, undef
-  %v3f64 = fadd <3 x double> undef, undef
-  %v4f64 = fadd <4 x double> undef, undef
-  %v5f64 = fadd <5 x double> undef, undef
+  %f64 = fadd double poison, poison
+  %v2f64 = fadd <2 x double> poison, poison
+  %v3f64 = fadd <3 x double> poison, poison
+  %v4f64 = fadd <4 x double> poison, poison
+  %v5f64 = fadd <5 x double> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fadd_f16() #0 {
 ; FASTF16-LABEL: 'fadd_f16'
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison
 ; FASTF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOWF64-LABEL: 'fadd_f16'
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FASTF16-SIZE-LABEL: 'fadd_f16'
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison
 ; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOWF64-SIZE-LABEL: 'fadd_f16'
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison
 ; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = fadd half undef, undef
-  %v2f16 = fadd <2 x half> undef, undef
-  %v3f16 = fadd <3 x half> undef, undef
-  %v4f16 = fadd <4 x half> undef, undef
-  %v5f16 = fadd <5 x half> undef, undef
-  %v16f16 = fadd <16 x half> undef, undef
-  %v17f16 = fadd <17 x half> undef, undef
+  %f16 = fadd half poison, poison
+  %v2f16 = fadd <2 x half> poison, poison
+  %v3f16 = fadd <3 x half> poison, poison
+  %v4f16 = fadd <4 x half> poison, poison
+  %v5f16 = fadd <5 x half> poison, poison
+  %v16f16 = fadd <16 x half> poison, poison
+  %v17f16 = fadd <17 x half> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fadd_bf16() #0 {
 ; GFX1250-LABEL: 'fadd_bf16'
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
-; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat poison, poison
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> poison, poison
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = fadd <3 x bfloat> poison, poison
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = fadd <4 x bfloat> poison, poison
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> poison, poison
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = fadd <16 x bfloat> poison, poison
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = fadd <17 x bfloat> poison, poison
 ; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
 ; GFX1250-SIZE-LABEL: 'fadd_bf16'
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = fadd <3 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = fadd <4 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = fadd <16 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = fadd <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = fadd bfloat poison, poison
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = fadd <2 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = fadd <3 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = fadd <4 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = fadd <5 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = fadd <16 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = fadd <17 x bfloat> poison, poison
 ; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
-  %bf16 = fadd bfloat undef, undef
-  %v2bf16 = fadd <2 x bfloat> undef, undef
-  %v3bf16 = fadd <3 x bfloat> undef, undef
-  %v4bf16 = fadd <4 x bfloat> undef, undef
-  %v5bf16 = fadd <5 x bfloat> undef, undef
-  %v16bf16 = fadd <16 x bfloat> undef, undef
-  %v17bf16 = fadd <17 x bfloat> undef, undef
+  %bf16 = fadd bfloat poison, poison
+  %v2bf16 = fadd <2 x bfloat> poison, poison
+  %v3bf16 = fadd <3 x bfloat> poison, poison
+  %v4bf16 = fadd <4 x bfloat> poison, poison
+  %v5bf16 = fadd <5 x bfloat> poison, poison
+  %v16bf16 = fadd <16 x bfloat> poison, poison
+  %v17bf16 = fadd <17 x bfloat> poison, poison
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
index db2170af2c801..f34ee31bcf4ce 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll
@@ -12,176 +12,176 @@
 
 define void @fma_f16() {
 ; FAST-LABEL: 'fma_f16'
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fma_f16'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FAST-SIZE-LABEL: 'fma_f16'
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_f16'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = call half @llvm.fma.f16(half undef, half undef, half undef)
-  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef)
-  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef)
-  %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef)
-  %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef)
-  %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef)
-  %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef)
+  %f16 = call half @llvm.fma.f16(half poison, half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison)
+  %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison)
+  %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison)
+  %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison)
   ret void
 }
 
 define void @fma_bf16() {
 ; FAST-LABEL: 'fma_bf16'
-; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fma_bf16'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FAST-SIZE-LABEL: 'fma_bf16'
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_bf16'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX1250-LABEL: 'fma_bf16'
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
 ; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ; GFX1250-SIZE-LABEL: 'fma_bf16'
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-  %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
-  %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
-  %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef)
-  %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
-  %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef)
-  %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef)
-  %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef)
+  %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison)
+  %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison)
+  %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison)
   ret void
 }
 
 define void @fma_f32() {
 ; SLOW-LABEL: 'fma_f32'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_f32'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef)
-  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef)
-  %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef)
-  %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-  %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef)
+  %f32 = call float @llvm.fma.f32(float poison, float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
+  %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison)
+  %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison)
+  %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison)
   ret void
 }
 
 define void @fma_f64() {
 ; SLOW-LABEL: 'fma_f64'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison)
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fma_f64'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison)
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef)
-  %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-  %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef)
+  %f64 = call double @llvm.fma.f64(double poison, double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison)
+  %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison)
   ret void
 }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index c022046fbf1df..c0b9cda23ea04 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -11,222 +11,222 @@
 
 define amdgpu_kernel void @fmul_f32() #0 {
 ; GFX90A-FASTF64-LABEL: 'fmul_f32'
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; F32-LABEL: 'fmul_f32'
-; F32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; F32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; F32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison
+; F32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison
+; F32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison
+; F32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison
+; F32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison
+; F32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison
+; F32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison
 ; F32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-SIZE-LABEL: 'fmul_f32'
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison
 ; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmul_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = fmul float undef, undef
-  %v2f32 = fmul <2 x float> undef, undef
-  %v3f32 = fmul <3 x float> undef, undef
-  %v4f32 = fmul <4 x float> undef, undef
-  %v5f32 = fmul <5 x float> undef, undef
-  %v8f32 = fmul <8 x float> undef, undef
-  %v9f32 = fmul <9 x float> undef, undef
+  %f32 = fmul float poison, poison
+  %v2f32 = fmul <2 x float> poison, poison
+  %v3f32 = fmul <3 x float> poison, poison
+  %v4f32 = fmul <4 x float> poison, poison
+  %v5f32 = fmul <5 x float> poison, poison
+  %v8f32 = fmul <8 x float> poison, poison
+  %v9f32 = fmul <9 x float> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fmul_f64() #0 {
 ; GFX90A-FASTF64-LABEL: 'fmul_f64'
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FASTF64-LABEL: 'fmul_f64'
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fmul_f64'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> poison, poison
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-SIZE-LABEL: 'fmul_f64'
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef
-; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison
+; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison
 ; GFX90A-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'fmul_f64'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = fmul double undef, undef
-  %v2f64 = fmul <2 x double> undef, undef
-  %v3f64 = fmul <3 x double> undef, undef
-  %v4f64 = fmul <4 x double> undef, undef
-  %v5f64 = fmul <5 x double> undef, undef
+  %f64 = fmul double poison, poison
+  %v2f64 = fmul <2 x double> poison, poison
+  %v3f64 = fmul <3 x double> poison, poison
+  %v4f64 = fmul <4 x double> poison, poison
+  %v5f64 = fmul <5 x double> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fmul_f16() #0 {
 ; GFX9-LABEL: 'fmul_f16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fmul_f16'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'fmul_f16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fmul_f16'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = fmul half undef, undef
-  %v2f16 = fmul <2 x half> undef, undef
-  %v3f16 = fmul <3 x half> undef, undef
-  %v4f16 = fmul <4 x half> undef, undef
-  %v5f16 = fmul <5 x half> undef, undef
-  %v16f16 = fmul <16 x half> undef, undef
-  %v17f16 = fmul <17 x half> undef, undef
+  %f16 = fmul half poison, poison
+  %v2f16 = fmul <2 x half> poison, poison
+  %v3f16 = fmul <3 x half> poison, poison
+  %v4f16 = fmul <4 x half> poison, poison
+  %v5f16 = fmul <5 x half> poison, poison
+  %v16f16 = fmul <16 x half> poison, poison
+  %v17f16 = fmul <17 x half> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fmul_bf16() #0 {
 ; GFX9-LABEL: 'fmul_bf16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOW-LABEL: 'fmul_bf16'
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; SLOW-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison
+; SLOW-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison
 ; SLOW-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX1250-LABEL: 'fmul_bf16'
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison
 ; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'fmul_bf16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOW-SIZE-LABEL: 'fmul_bf16'
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison
+; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison
 ; SLOW-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX1250-SIZE-LABEL: 'fmul_bf16'
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-  %bf16 = fmul bfloat undef, undef
-  %v2bf16 = fmul <2 x bfloat> undef, undef
-  %v3bf16 = fmul <3 x bfloat> undef, undef
-  %v4bf16 = fmul <4 x bfloat> undef, undef
-  %v5bf16 = fmul <5 x bfloat> undef, undef
-  %v16bf16 = fmul <16 x bfloat> undef, undef
-  %v17bf16 = fmul <17 x bfloat> undef, undef
+  %bf16 = fmul bfloat poison, poison
+  %v2bf16 = fmul <2 x bfloat> poison, poison
+  %v3bf16 = fmul <3 x bfloat> poison, poison
+  %v4bf16 = fmul <4 x bfloat> poison, poison
+  %v5bf16 = fmul <5 x bfloat> poison, poison
+  %v16bf16 = fmul <16 x bfloat> poison, poison
+  %v17bf16 = fmul <17 x bfloat> poison, poison
   ret void
 }
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index 37f947d9c6341..6b71603f70f6b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -11,182 +11,182 @@
 
 define amdgpu_kernel void @fsub_f32() #0 {
 ; GFX90A-FASTF64-LABEL: 'fsub_f32'
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; NOPACKEDF32-LABEL: 'fsub_f32'
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison
+; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison
 ; NOPACKEDF32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32'
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; NOPACKEDF32-SIZE-LABEL: 'fsub_f32'
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f32 = fsub float undef, undef
-  %v2f32 = fsub <2 x float> undef, undef
-  %v3f32 = fsub <3 x float> undef, undef
-  %v4f32 = fsub <4 x float> undef, undef
-  %v5f32 = fsub <5 x float> undef, undef
-  %v8f32 = fsub <8 x float> undef, undef
-  %v9f32 = fsub <9 x float> undef, undef
+  %f32 = fsub float poison, poison
+  %v2f32 = fsub <2 x float> poison, poison
+  %v3f32 = fsub <3 x float> poison, poison
+  %v4f32 = fsub <4 x float> poison, poison
+  %v5f32 = fsub <5 x float> poison, poison
+  %v8f32 = fsub <8 x float> poison, poison
+  %v9f32 = fsub <9 x float> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fsub_f64() #0 {
 ; GFX90A-FASTF64-LABEL: 'fsub_f64'
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef
-; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison
+; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison
 ; GFX90A-FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FASTF64-LABEL: 'fsub_f64'
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef
-; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison
+; FASTF64-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison
 ; FASTF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOWF64-LABEL: 'fsub_f64'
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> undef, undef
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> poison, poison
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f64'
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef
-; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison
+; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison
 ; GFX90A-FASTF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; NOPACKEDF32-SIZE-LABEL: 'fsub_f64'
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef
-; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison
+; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison
 ; NOPACKEDF32-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f64 = fsub double undef, undef
-  %v2f64 = fsub <2 x double> undef, undef
-  %v3f64 = fsub <3 x double> undef, undef
-  %v4f64 = fsub <4 x double> undef, undef
-  %v5f64 = fsub <5 x double> undef, undef
+  %f64 = fsub double poison, poison
+  %v2f64 = fsub <2 x double> poison, poison
+  %v3f64 = fsub <3 x double> poison, poison
+  %v4f64 = fsub <4 x double> poison, poison
+  %v5f64 = fsub <5 x double> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fsub_f16() #0 {
 ; FASTF16-LABEL: 'fsub_f16'
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef
-; FASTF16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison
+; FASTF16-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison
 ; FASTF16-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SLOWF64-LABEL: 'fsub_f16'
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef
-; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison
+; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison
 ; SLOWF64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; FASTF16-SIZE-LABEL: 'fsub_f16'
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef
-; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison
+; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison
 ; FASTF16-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLOWF64-SIZE-LABEL: 'fsub_f16'
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef
-; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison
+; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison
 ; SLOWF64-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %f16 = fsub half undef, undef
-  %v2f16 = fsub <2 x half> undef, undef
-  %v3f16 = fsub <3 x half> undef, undef
-  %v4f16 = fsub <4 x half> undef, undef
-  %v5f16 = fsub <5 x half> undef, undef
-  %v16f16 = fsub <16 x half> undef, undef
-  %v17f16 = fsub <17 x half> undef, undef
+  %f16 = fsub half poison, poison
+  %v2f16 = fsub <2 x half> poison, poison
+  %v3f16 = fsub <3 x half> poison, poison
+  %v4f16 = fsub <4 x half> poison, poison
+  %v5f16 = fsub <5 x half> poison, poison
+  %v16f16 = fsub <16 x half> poison, poison
+  %v17f16 = fsub <17 x half> poison, poison
   ret void
 }
 
 define amdgpu_kernel void @fsub_bf16() #0 {
 ; GFX1250-LABEL: 'fsub_bf16'
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
-; GFX1250-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison
+; GFX1250-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison
 ; GFX1250-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX1250-SIZE-LABEL: 'fsub_bf16'
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> undef, undef
-; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> undef, undef
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison
 ; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-  %bf16 = fsub bfloat undef, undef
-  %v2bf16 = fsub <2 x bfloat> undef, undef
-  %v3bf16 = fsub <3 x bfloat> undef, undef
-  %v4bf16 = fsub <4 x bfloat> undef, undef
-  %v5bf16 = fsub <5 x bfloat> undef, undef
-  %v16bf16 = fsub <16 x bfloat> undef, undef
-  %v17bf16 = fsub <17 x bfloat> undef, undef
+  %bf16 = fsub bfloat poison, poison
+  %v2bf16 = fsub <2 x bfloat> poison, poison
+  %v3bf16 = fsub <3 x bfloat> poison, poison
+  %v4bf16 = fsub <4 x bfloat> poison, poison
+  %v5bf16 = fsub <5 x bfloat> poison, poison
+  %v16bf16 = fsub <16 x bfloat> poison, poison
+  %v17bf16 = fsub <17 x bfloat> poison, poison
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index e10141f9ba809..0490e5a19b4b7 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -7,14 +7,10 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
-<<<<<<< HEAD
 ; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16
 
 ; FIXME: real-true16 version of gfx1250 test fails
-=======
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
->>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
@@ -49042,6 +49038,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat)
 declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
 declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)
 declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>)
 
 define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
 ; GCN-LABEL: v_fma_bf16:
@@ -49363,10 +49362,7 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-<<<<<<< HEAD
 ;
-=======
->>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
 ; GFX1250-LABEL: v_fma_v2bf16:
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -49641,7 +49637,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11FAKE16-NEXT:    v_alignbit_b32 v1, s0, v3, 16
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-<<<<<<< HEAD
 ;
 ; GFX1250-LABEL: v_fma_v3bf16:
 ; GFX1250:       ; %bb.0:
@@ -49650,15 +49645,6 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
 ; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v2, v4
 ; GFX1250-NEXT:    v_pk_fma_bf16 v1, v1, v3, v5
 ; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
-=======
-; GFX1250-LABEL:     v_fma_v3bf16:
-; GFX1250:           %bb.0:
-; GFX1250-NEXT:        s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:        s_wait_kmcnt 0x0
-; GFX1250-NEXT:        v_pk_fma_bf16 v0, v0, v2, v4
-; GFX1250-NEXT:        v_pk_fma_bf16 v1, v1, v3, v5
-; GFX1250-NEXT:        s_set_pc_i64 s[30:31]
->>>>>>> cc3762e87c75 (Add testing coverage - part I)
   %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
   ret <3 x bfloat> %op
 }
@@ -49993,10 +49979,7 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
-<<<<<<< HEAD
 ;
-=======
->>>>>>> b01cd5e2411a ([AMDGPU] Fix vector legalization for bf16 valu ops)
 ; GFX1250-LABEL: v_fma_v4bf16:
 ; GFX1250:       ; %bb.0:
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -50008,6 +49991,98 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
   ret <4 x bfloat> %op
 }
 
+define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
+; GFX1250-LABEL: v_fma_v8bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v4, v8
+; GFX1250-NEXT:    v_pk_fma_bf16 v1, v1, v5, v9
+; GFX1250-NEXT:    v_pk_fma_bf16 v2, v2, v6, v10
+; GFX1250-NEXT:    v_pk_fma_bf16 v3, v3, v7, v11
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c)
+  ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
+; GFX1250-LABEL: v_fma_v16bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v8, v16
+; GFX1250-NEXT:    v_pk_fma_bf16 v1, v1, v9, v17
+; GFX1250-NEXT:    v_pk_fma_bf16 v2, v2, v10, v18
+; GFX1250-NEXT:    v_pk_fma_bf16 v3, v3, v11, v19
+; GFX1250-NEXT:    v_pk_fma_bf16 v4, v4, v12, v20
+; GFX1250-NEXT:    v_pk_fma_bf16 v5, v5, v13, v21
+; GFX1250-NEXT:    v_pk_fma_bf16 v6, v6, v14, v22
+; GFX1250-NEXT:    v_pk_fma_bf16 v7, v7, v15, v23
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c)
+  ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
+; GFX1250-LABEL: v_fma_v32bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x10
+; GFX1250-NEXT:    scratch_load_b32 v31, off, s32 offset:64
+; GFX1250-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX1250-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX1250-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX1250-NEXT:    scratch_load_b32 v35, off, s32 offset:16
+; GFX1250-NEXT:    scratch_load_b32 v36, off, s32 offset:20
+; GFX1250-NEXT:    scratch_load_b32 v37, off, s32 offset:24
+; GFX1250-NEXT:    scratch_load_b32 v38, off, s32 offset:28
+; GFX1250-NEXT:    scratch_load_b32 v39, off, s32 offset:32
+; GFX1250-NEXT:    scratch_load_b32 v48, off, s32 offset:36
+; GFX1250-NEXT:    scratch_load_b32 v49, off, s32 offset:40
+; GFX1250-NEXT:    scratch_load_b32 v50, off, s32 offset:44
+; GFX1250-NEXT:    scratch_load_b32 v51, off, s32 offset:48
+; GFX1250-NEXT:    scratch_load_b32 v52, off, s32 offset:52
+; GFX1250-NEXT:    scratch_load_b32 v53, off, s32 offset:56
+; GFX1250-NEXT:    scratch_load_b32 v54, off, s32 offset:60
+; GFX1250-NEXT:    scratch_load_b32 v55, off, s32
+; GFX1250-NEXT:    s_wait_loadcnt 0xf
+; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v16, v32
+; GFX1250-NEXT:    s_wait_loadcnt 0xe
+; GFX1250-NEXT:    v_pk_fma_bf16 v1, v1, v17, v33
+; GFX1250-NEXT:    s_wait_loadcnt 0xd
+; GFX1250-NEXT:    v_pk_fma_bf16 v2, v2, v18, v34
+; GFX1250-NEXT:    s_wait_loadcnt 0xc
+; GFX1250-NEXT:    v_pk_fma_bf16 v3, v3, v19, v35
+; GFX1250-NEXT:    s_wait_loadcnt 0xb
+; GFX1250-NEXT:    v_pk_fma_bf16 v4, v4, v20, v36
+; GFX1250-NEXT:    s_wait_loadcnt 0xa
+; GFX1250-NEXT:    v_pk_fma_bf16 v5, v5, v21, v37
+; GFX1250-NEXT:    s_wait_loadcnt 0x9
+; GFX1250-NEXT:    v_pk_fma_bf16 v6, v6, v22, v38
+; GFX1250-NEXT:    s_wait_loadcnt 0x8
+; GFX1250-NEXT:    v_pk_fma_bf16 v7, v7, v23, v39
+; GFX1250-NEXT:    s_wait_loadcnt 0x7
+; GFX1250-NEXT:    v_pk_fma_bf16 v8, v8, v24, v48
+; GFX1250-NEXT:    s_wait_loadcnt 0x6
+; GFX1250-NEXT:    v_pk_fma_bf16 v9, v9, v25, v49
+; GFX1250-NEXT:    s_wait_loadcnt 0x5
+; GFX1250-NEXT:    v_pk_fma_bf16 v10, v10, v26, v50
+; GFX1250-NEXT:    s_wait_loadcnt 0x4
+; GFX1250-NEXT:    v_pk_fma_bf16 v11, v11, v27, v51
+; GFX1250-NEXT:    s_wait_loadcnt 0x3
+; GFX1250-NEXT:    v_pk_fma_bf16 v12, v12, v28, v52
+; GFX1250-NEXT:    s_wait_loadcnt 0x2
+; GFX1250-NEXT:    v_pk_fma_bf16 v13, v13, v29, v53
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    v_pk_fma_bf16 v14, v14, v30, v54
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_pk_fma_bf16 v15, v15, v55, v31
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c)
+  ret <32 x bfloat> %op
+}
+
 declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat)
 declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>)
 declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>)