From d98c99441f052f9850a96048a598018d83035115 Mon Sep 17 00:00:00 2001 From: Paul Bowen-Huggett Date: Mon, 6 Jan 2025 09:50:46 +0100 Subject: [PATCH 01/49] [NFC] Make AMDGPUCombinerHelper methods const This is a follow-up to a previous commit (ee7ca0d) which eliminated several "TODO: make CombinerHelper methods const" remarks. As promised in that earlier commit, this change completes the set by also making the methods of AMDGPUCombinerHelper const so that the Helper member of AMDGPUPreLegalizerCombinerImpl can be const rather than explicitly mutable. --- llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 8 ++++---- llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h | 8 ++++---- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 3 +-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index e5a376ab7357c..6fa8117004899 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -190,7 +190,7 @@ static unsigned inverseMinMax(unsigned Opc) { } bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI, - MachineInstr *&MatchInfo) { + MachineInstr *&MatchInfo) const { Register Src = MI.getOperand(1).getReg(); MatchInfo = MRI.getVRegDef(Src); @@ -259,7 +259,7 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI, } void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, - MachineInstr *&MatchInfo) { + MachineInstr *&MatchInfo) const { // Transform: // %A = inst %Op1, ... 
// %B = fneg %A @@ -418,7 +418,7 @@ static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI, bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, Register Src1, - Register Src2) { + Register Src2) const { assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC); Register SrcReg = MI.getOperand(1).getReg(); if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32)) @@ -431,7 +431,7 @@ bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI, void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, Register Src1, - Register Src2) { + Register Src2) const { // We expect fptrunc (fpext x) to fold out, and to constant fold any constant // sources. Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h index 6510abe9d2321..30601126e833b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h @@ -23,13 +23,13 @@ class AMDGPUCombinerHelper : public CombinerHelper { public: using CombinerHelper::CombinerHelper; - bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); - void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); + bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo) const; + void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo) const; bool matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, - Register Src1, Register Src2); + Register Src1, Register Src2) const; void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, - Register Src1, Register Src2); + Register Src1, Register Src2) const; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index ff8189ce31f7f..ac431ccc30903 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -45,8 +45,7 @@ class AMDGPUPreLegalizerCombinerImpl : public Combiner { protected: const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig; const GCNSubtarget &STI; - // TODO: Make CombinerHelper methods const. - mutable AMDGPUCombinerHelper Helper; + const AMDGPUCombinerHelper Helper; public: AMDGPUPreLegalizerCombinerImpl( From 0494c8dc832273017f5d8151f2617f6a3df396ca Mon Sep 17 00:00:00 2001 From: Lee Wei Date: Mon, 6 Jan 2025 01:43:39 -0700 Subject: [PATCH 02/49] [llvm] Remove `br i1 undef` from CodeGen/X86 tests (#121733) This PR removes tests with `br i1 undef` under `llvm/tests/CodeGen/X86`. There will be more PRs in the future for this directory. Replacing `undef` with a new function argument doesn't work in some of the tests, instead, I've replaced them with `poison`. --- llvm/test/CodeGen/X86/2011-06-03-x87chain.ll | 4 +- .../X86/2020_12_02_decrementing_loop.ll | 8 +- .../test/CodeGen/X86/AMX/amx-combine-undef.ll | 76 +++++++++---------- .../X86/AMX/lat-combine-amx-bitcast.ll | 62 +++++++-------- .../X86/AMX/lat-transform-amx-bitcast.ll | 10 +-- llvm/test/CodeGen/X86/StackColoring.ll | 2 +- llvm/test/CodeGen/X86/asm-label.ll | 10 +-- llvm/test/CodeGen/X86/avx-select.ll | 2 +- llvm/test/CodeGen/X86/avx512-i1test.ll | 10 +-- llvm/test/CodeGen/X86/block-placement.ll | 28 +++---- llvm/test/CodeGen/X86/clobber_frame_ptr.ll | 3 +- .../X86/codegen-prepare-replacephi.mir | 6 +- .../X86/codegen-prepare-replacephi2.mir | 6 +- .../test/CodeGen/X86/combine-concatvectors.ll | 2 +- llvm/test/CodeGen/X86/crash.ll | 48 ++++++------ .../CodeGen/X86/domain-reassignment-test.ll | 4 +- llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll | 4 +- .../CodeGen/X86/fold-vector-shuffle-crash.ll | 6 +- llvm/test/CodeGen/X86/hoist-spill.ll | 12 +-- .../test/CodeGen/X86/implicit-null-checks.mir | 30 ++++---- .../test/CodeGen/X86/interval-update-remat.ll | 6 +- 
llvm/test/CodeGen/X86/jump_sign.ll | 11 ++- .../CodeGen/X86/loop-strength-reduce-crash.ll | 4 +- llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll | 4 +- llvm/test/CodeGen/X86/lsr-delayed-fold.ll | 20 ++--- .../X86/machine-trace-metrics-crash.ll | 4 +- .../merge-vector-stores-scale-idx-crash.ll | 4 +- llvm/test/CodeGen/X86/misched-crash.ll | 4 +- llvm/test/CodeGen/X86/pr10475.ll | 8 +- llvm/test/CodeGen/X86/pr11998.ll | 4 +- llvm/test/CodeGen/X86/pr32108.ll | 2 +- llvm/test/CodeGen/X86/pr50254.ll | 2 +- llvm/test/CodeGen/X86/pr57673.ll | 2 +- llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll | 30 ++++---- llvm/test/CodeGen/X86/shift-combine.ll | 2 +- .../test/CodeGen/X86/shuffle-combine-crash.ll | 3 +- llvm/test/CodeGen/X86/stackmap.ll | 8 +- llvm/test/CodeGen/X86/swifterror.ll | 2 +- llvm/test/CodeGen/X86/switch.ll | 2 +- .../CodeGen/X86/tail-merge-unreachable.ll | 4 +- .../CodeGen/X86/unreachable-loop-sinking.ll | 4 +- llvm/test/CodeGen/X86/update-terminator.mir | 6 +- .../vector-shuffle-combining-avx512bwvl.ll | 2 +- llvm/test/CodeGen/X86/x86-shrink-wrapping.ll | 10 +-- 44 files changed, 238 insertions(+), 243 deletions(-) diff --git a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll index 67fd59ed4c263..ed3dcad227bcd 100644 --- a/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll +++ b/llvm/test/CodeGen/X86/2011-06-03-x87chain.ll @@ -30,9 +30,9 @@ entry: ret float %conv } -define void @PR17495() { +define void @PR17495(i1 %arg) { entry: - br i1 undef, label %while.end, label %while.body + br i1 %arg, label %while.end, label %while.body while.body: ; preds = %while.body, %entry %x.1.copyload = load i24, ptr undef, align 1 diff --git a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll index 22bf4581c6b42..49de5091f0e5f 100644 --- a/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll +++ b/llvm/test/CodeGen/X86/2020_12_02_decrementing_loop.ll @@ -165,7 +165,7 @@ failure: ; 
preds = %backedge unreachable } -define void @test_04() { +define void @test_04(i32 %arg) { ; CHECK-LABEL: test_04: ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: ud2 @@ -175,7 +175,7 @@ bb: bb1: ; preds = %bb10, %bb %tmp = phi i64 [ 1, %bb ], [ %tmp2, %bb10 ] %tmp2 = add nuw nsw i64 %tmp, 1 - br i1 undef, label %bb21, label %bb7 + br i1 poison, label %bb21, label %bb7 bb7: ; preds = %bb1 %tmp8 = add nsw i64 %tmp, -1 @@ -187,7 +187,7 @@ bb10: ; preds = %bb16 br label %bb1 bb11: ; preds = %bb16, %bb7 - switch i32 undef, label %bb19 [ + switch i32 %arg, label %bb19 [ i32 0, label %bb17 i32 1, label %bb16 i32 2, label %bb15 @@ -205,7 +205,7 @@ bb15: ; preds = %bb11 unreachable bb16: ; preds = %bb11 - br i1 undef, label %bb10, label %bb11 + br i1 poison, label %bb10, label %bb11 bb17: ; preds = %bb11 unreachable diff --git a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll index 86874b14b3612..faa119cd037f1 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll @@ -5,13 +5,13 @@ define void @undef_2phi(ptr%buf) { ; CHECK-LABEL: @undef_2phi( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L2]], label [[L3:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[L3:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] -; CHECK-NEXT: br i1 undef, label [[L3]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L3]], label [[EXIT:%.*]] ; CHECK: l3: ; CHECK-NEXT: [[TMP2:%.*]] = phi x86_amx [ [[TMP1]], [[L2]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr 
[[BUF:%.*]], i64 1024, x86_amx [[TMP2]]) @@ -20,16 +20,16 @@ define void @undef_2phi(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %l3 + br i1 poison, label %l2, label %l3 l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] - br i1 undef, label %l3, label %exit + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] + br i1 poison, label %l3, label %exit l3: %t4 = phi <256 x i32> [ %t3, %l2], [ %t2, %l1 ] @@ -45,10 +45,10 @@ define void @foo_undef(ptr%buf) { ; CHECK-LABEL: @foo_undef( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]]) @@ -57,15 +57,15 @@ define void @foo_undef(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) call void 
@llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t4) br label %exit @@ -78,10 +78,10 @@ define void @foo_zero(ptr%buf) { ; CHECK-LABEL: @foo_zero( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]]) @@ -90,12 +90,12 @@ define void @foo_zero(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ] @@ -112,14 +112,14 @@ define void @foo_vrow(ptr%buf, i16 %row) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW:%.*]], i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[TMP1]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP3:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, 
label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ] +; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 32, ptr [[TMP0]], i64 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP5]]) @@ -128,15 +128,15 @@ define void @foo_vrow(ptr%buf, i16 %row) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) call void @llvm.x86.tilestored64.internal(i16 %row, i16 32, ptr %buf, i64 1024, x86_amx %t4) br label %exit @@ -150,13 +150,13 @@ define void @foo_vcol(ptr%buf, i16 %col) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 [[COL:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL]] to i64 ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[TMP1]], i64 [[TMP3]], x86_amx [[T1]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label 
[[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP4]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[TMP0]], align 1024 @@ -168,12 +168,12 @@ define void @foo_vcol(ptr%buf, i16 %col) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 %col) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ] @@ -189,29 +189,29 @@ define void @noshape(ptr%buf) { ; CHECK-LABEL: @noshape( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] +; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi 
<256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4) store <256 x i32> %t5, ptr %buf @@ -225,14 +225,14 @@ define void @noshape2(ptr%buf) { ; CHECK-LABEL: @noshape2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 poison, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 poison, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] +; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ poison, [[ENTRY:%.*]] ], [ [[TMP2]], [[L1]] ] ; CHECK-NEXT: [[T6:%.*]] = call <256 x i32> @llvm.abs.v256i32(<256 x i32> [[T3]], i1 true) ; CHECK-NEXT: store <256 x i32> [[T6]], ptr [[BUF:%.*]], align 1024 ; CHECK-NEXT: br label [[EXIT]] @@ -240,15 +240,15 @@ define void @noshape2(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 poison, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 poison, label %l2, label %exit l2: - %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] + %t3 = phi <256 x i32> [ poison, %entry ], [ %t2, %l1 ] %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4) 
%t6 = call <256 x i32> @llvm.abs.v256i32(<256 x i32> %t5, i1 1) diff --git a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll index b2eb5fd915b96..b70668f7a3dea 100644 --- a/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll @@ -18,14 +18,14 @@ wrapper_entry: ; Cases where amxcast can be combined across bb ; %5 and %6 is combined together since %goodphi's incoming is phi or amxcast -define void @combine_amx_cast_and_phi() { +define void @combine_amx_cast_and_phi(i1 %arg) { ; CHECK-LABEL: @combine_amx_cast_and_phi( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) @@ -43,7 +43,7 @@ define void @combine_amx_cast_and_phi() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -62,7 +62,7 @@ for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, ; Cases where amxcast can't be combined across bb ; %5 and %6 is not combined 
together since %evilphi's incoming is not phi or amxcast -define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { +define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp, i1 %arg) { ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 @@ -71,7 +71,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { ; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP5:%.*]] = add <110 x i32> [[TMP:%.*]], [[TMP]] -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512 ; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) @@ -92,7 +92,7 @@ define void @fail_to_combine_amx_cast_and_phi(<110 x i32> %tmp) { ; wrapper_entry: %0 = add <110 x i32> %tmp, %tmp - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -111,7 +111,7 @@ for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, ; Cases where amxcast can't be combined across bb ; %5 and %6 is not combined together since %evilphi's user aka %evilphi2 is not inside phi web. 
-define void @fail_to_combine_amx_cast_and_phi2() { +define void @fail_to_combine_amx_cast_and_phi2(i1 %arg) { ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi2( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 64 @@ -123,7 +123,7 @@ define void @fail_to_combine_amx_cast_and_phi2() { ; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, x86_amx [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = load <110 x i32>, ptr [[TMP5]], align 512 -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512 ; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) @@ -134,13 +134,13 @@ define void @fail_to_combine_amx_cast_and_phi2() { ; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]]) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP15]]) ; CHECK-NEXT: [[TMP17:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512 -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] ; CHECK: for.cond.cleanup.i.i: ; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: store <110 x i32> [[GOODPHI]], ptr [[TMP0]], align 512 ; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr 
undef, i64 undef, x86_amx [[TMP19]]) -; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] ; CHECK: exit: ; CHECK-NEXT: [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: store <110 x i32> [[EVILPHI2]], ptr undef, align 512 @@ -149,7 +149,7 @@ define void @fail_to_combine_amx_cast_and_phi2() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -157,27 +157,27 @@ for.body.i.lr.ph.i: ; preds = %wrapper_entry %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) - br i1 undef, label %for.cond.cleanup.i.i, label %exit + br i1 %arg, label %for.cond.cleanup.i.i, label %exit for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) - br i1 undef, label %exit, label %for.body.i.lr.ph.i + br i1 %arg, label %exit, label %for.body.i.lr.ph.i exit: %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] store <110 x i32> %evilphi2, ptr undef, align 512 ret void } -define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() { +define void 
@fail_to_combine_amx_cast_and_phi_due_to_const_value(i1 %arg) { ; CHECK-LABEL: @fail_to_combine_amx_cast_and_phi_due_to_const_value( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 11, i16 40) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) @@ -193,7 +193,7 @@ define void @fail_to_combine_amx_cast_and_phi_due_to_const_value() { ; CHECK-NEXT: ret void ; wrapper_entry: - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -213,14 +213,14 @@ for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, ; Cases where amxcast can be combined across bb ; When optimizeAMXCastFromPhi process %6 and %goodphi, %goodphi2 is outside the phi-web, so the optimization stop ; When optimizeAMXCastFromPhi process %7 and %goodphi2, the optimization continue. 
-define void @combine_amx_cast_and_multiple_phi() { +define void @combine_amx_cast_and_multiple_phi(i1 %arg) { ; CHECK-LABEL: @combine_amx_cast_and_multiple_phi( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 ; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) @@ -229,11 +229,11 @@ define void @combine_amx_cast_and_multiple_phi() { ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) ; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] ; CHECK: for.cond.cleanup.i.i: ; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]]) -; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] ; CHECK: exit: ; CHECK-NEXT: [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: call 
void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP12]]) @@ -242,7 +242,7 @@ define void @combine_amx_cast_and_multiple_phi() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) @@ -250,13 +250,13 @@ for.body.i.lr.ph.i: ; preds = %wrapper_entry %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) - br i1 undef, label %for.cond.cleanup.i.i, label %exit + br i1 %arg, label %for.cond.cleanup.i.i, label %exit for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) - br i1 undef, label %exit, label %for.body.i.lr.ph.i + br i1 %arg, label %exit, label %for.body.i.lr.ph.i exit: %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) @@ -265,7 +265,7 @@ exit: } ; Currently we are not able to delete DeadPHICycle, later we will handle with them -define void @combine_amx_cast_and_phi_in_a_circle() { +define void @combine_amx_cast_and_phi_in_a_circle(i1 %arg) { ; CHECK-LABEL: @combine_amx_cast_and_phi_in_a_circle( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <110 x i32>, align 
64 @@ -284,7 +284,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() { ; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]]) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, x86_amx [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = load <110 x i32>, ptr [[TMP0]], align 512 -; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ] ; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ] @@ -294,7 +294,7 @@ define void @combine_amx_cast_and_phi_in_a_circle() { ; CHECK-NEXT: [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ] ; CHECK-NEXT: [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ [[TMP13]], [[BB1]] ] ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]]) -; CHECK-NEXT: br i1 undef, label [[BB2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[BB2]], label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]]) ; CHECK-NEXT: ret void @@ -310,7 +310,7 @@ bb1: ; preds = %wrapper_entry %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) - br i1 undef, label %bb2, label %bb3 + br i1 %arg, label %bb2, label %bb3 bb2: ; preds = %bb1, %wrapper_entry %goodphi = phi <110 x i32> [ %evilphi2, %bb3], [ %5, %bb1 ] @@ -321,19 +321,19 @@ bb3: %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ] %7 = call x86_amx 
@llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7) - br i1 undef, label %bb2, label %exit + br i1 %arg, label %bb2, label %exit exit: %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8) ret void } -define void @eliminate_unused_phi_and_cast() { +define void @eliminate_unused_phi_and_cast(i1 %arg) { ; CHECK-LABEL: @eliminate_unused_phi_and_cast( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) ; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef) @@ -349,7 +349,7 @@ define void @eliminate_unused_phi_and_cast() { wrapper_entry: %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i + br i1 %arg, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) diff --git a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll index 391727d54a03a..3a5b424540ff1 100644 --- a/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll +++ 
b/llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll @@ -317,16 +317,16 @@ define dso_local void @__tile_stored(ptr %0, i64 %1, ptr nocapture readonly byva ret void } -define void @dead_code(ptr%buf) { +define void @dead_code(ptr%buf, i1 %arg) { ; CHECK-LABEL: @dead_code( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: ; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP1]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[BUF:%.*]], align 1024 @@ -335,12 +335,12 @@ define void @dead_code(ptr%buf) { ; CHECK-NEXT: ret void ; entry: - br i1 undef, label %l1, label %l2 + br i1 %arg, label %l1, label %l2 l1: %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) - br i1 undef, label %l2, label %exit + br i1 %arg, label %l2, label %exit l2: %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] diff --git a/llvm/test/CodeGen/X86/StackColoring.ll b/llvm/test/CodeGen/X86/StackColoring.ll index 389d024dafd11..db3e7dcdfe2d5 100644 --- a/llvm/test/CodeGen/X86/StackColoring.ll +++ b/llvm/test/CodeGen/X86/StackColoring.ll @@ -135,7 +135,7 @@ entry: %t3 = call i32 @foo(i32 %in, ptr %a3) %t4 = call i32 @foo(i32 %in, ptr %a3) call void @llvm.lifetime.end.p0(i64 -1, ptr %a3) - br i1 undef, label %bb2, label %bb3 + br i1 poison, label %bb2, label %bb3 bb2: call void @llvm.lifetime.start.p0(i64 -1, ptr 
%a4) %t11 = call i32 @foo(i32 %in, ptr %a4) diff --git a/llvm/test/CodeGen/X86/asm-label.ll b/llvm/test/CodeGen/X86/asm-label.ll index 05c37db532f87..2d3e7b624d354 100644 --- a/llvm/test/CodeGen/X86/asm-label.ll +++ b/llvm/test/CodeGen/X86/asm-label.ll @@ -12,15 +12,15 @@ ; SAVETEMP: jne {{.*}} <.LBB0_1> ; SAVETEMP-LABEL: <.LBB0_1>: -define void @foo() { +define void @foo(i1 %arg, i32 %arg2) { entry: - br i1 undef, label %land.lhs.true, label %if.end11 + br i1 %arg, label %land.lhs.true, label %if.end11 land.lhs.true: ; preds = %entry - br i1 undef, label %if.then, label %if.end11 + br i1 %arg, label %if.then, label %if.end11 if.then: ; preds = %land.lhs.true - br i1 undef, label %if.then9, label %if.end + br i1 %arg, label %if.then9, label %if.end if.then9: ; preds = %if.then br label %cleanup @@ -29,7 +29,7 @@ if.end: ; preds = %if.then br label %cleanup cleanup: ; preds = %if.end, %if.then9 - switch i32 undef, label %default [ + switch i32 %arg2, label %default [ i32 0, label %cleanup.cont i32 1, label %if.end11 ] diff --git a/llvm/test/CodeGen/X86/avx-select.ll b/llvm/test/CodeGen/X86/avx-select.ll index 7a33daf18be87..1b688c8cf9cca 100644 --- a/llvm/test/CodeGen/X86/avx-select.ll +++ b/llvm/test/CodeGen/X86/avx-select.ll @@ -84,7 +84,7 @@ head: %isneg = icmp slt <4 x i32> %v3, zeroinitializer %or0 = select <4 x i1> %isneg, <4 x i32> , <4 x i32> %or1 = shufflevector <4 x i32> %or0, <4 x i32> , <8 x i32> - br i1 undef, label %exit, label %head + br i1 poison, label %exit, label %head exit: store <8 x i32> %or1, ptr addrspace(1) undef, align 32 diff --git a/llvm/test/CodeGen/X86/avx512-i1test.ll b/llvm/test/CodeGen/X86/avx512-i1test.ll index 3cd733181599e..d8683df5cbf7a 100644 --- a/llvm/test/CodeGen/X86/avx512-i1test.ll +++ b/llvm/test/CodeGen/X86/avx512-i1test.ll @@ -21,20 +21,20 @@ define void @func() { ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jmp .LBB0_2 bb1: - br i1 undef, label %L_10, label %L_10 + br i1 poison, label %L_10, label %L_10 L_10: ; preds = 
%bb1, %bb1 - br i1 undef, label %L_30, label %bb56 + br i1 poison, label %L_30, label %bb56 bb56: ; preds = %L_10 br label %bb33 bb33: ; preds = %bb51, %bb56 %r111 = load i64, ptr undef, align 8 - br i1 undef, label %bb51, label %bb35 + br i1 poison, label %bb51, label %bb35 bb35: ; preds = %bb33 - br i1 undef, label %L_19, label %bb37 + br i1 poison, label %L_19, label %bb37 bb37: ; preds = %bb35 %r128 = and i64 %r111, 576460752303423488 @@ -43,7 +43,7 @@ bb37: ; preds = %bb35 L_19: ; preds = %bb37, %bb35 %"$V_S25.0" = phi i1 [ %phitmp, %bb37 ], [ true, %bb35 ] - br i1 undef, label %bb51, label %bb42 + br i1 poison, label %bb51, label %bb42 bb42: ; preds = %L_19 %r136 = select i1 %"$V_S25.0", ptr undef, ptr undef diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll index 675293410dfe7..1369131413053 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -312,7 +312,7 @@ exit: ret i32 %sum } -define void @unnatural_cfg1() { +define void @unnatural_cfg1(i1 %arg) { ; Test that we can handle a loop with an inner unnatural loop at the end of ; a function. This is a gross CFG reduced out of the single source GCC. ; CHECK-LABEL: unnatural_cfg1 @@ -327,7 +327,7 @@ loop.header: br label %loop.body1 loop.body1: - br i1 undef, label %loop.body3, label %loop.body2 + br i1 %arg, label %loop.body3, label %loop.body2 loop.body2: %ptr = load ptr, ptr undef, align 4 @@ -341,14 +341,14 @@ loop.body3: br i1 %comp, label %loop.body4, label %loop.body5 loop.body4: - br i1 undef, label %loop.header, label %loop.body5 + br i1 %arg, label %loop.header, label %loop.body5 loop.body5: %ptr2 = load ptr, ptr undef, align 4 br label %loop.body3 } -define void @unnatural_cfg2(ptr %p0, i32 %a0) { +define void @unnatural_cfg2(ptr %p0, i32 %a0, i1 %arg) { ; Test that we can handle a loop with a nested natural loop *and* an unnatural ; loop. 
This was reduced from a crash on block placement when run over ; single-source GCC. @@ -372,10 +372,10 @@ loop.header: loop.body1: %val0 = load ptr, ptr undef, align 4 - br i1 undef, label %loop.body2, label %loop.inner1.begin + br i1 %arg, label %loop.body2, label %loop.inner1.begin loop.body2: - br i1 undef, label %loop.body4, label %loop.body3 + br i1 %arg, label %loop.body4, label %loop.body3 loop.body3: %ptr1 = getelementptr inbounds i32, ptr %val0, i32 0 @@ -467,7 +467,7 @@ exit: ret i32 %merge } -define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0) { +define void @fpcmp_unanalyzable_branch(i1 %cond, double %a0, i1 %arg) { ; This function's CFG contains an once-unanalyzable branch (une on floating ; points). As now it becomes analyzable, we should get best layout in which each ; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is @@ -493,7 +493,7 @@ entry.if.then_crit_edge: br label %if.then lor.lhs.false: - br i1 undef, label %if.end, label %exit + br i1 %arg, label %if.end, label %exit exit: %cmp.i = fcmp une double 0.000000e+00, %a0 @@ -516,7 +516,7 @@ declare i32 @f() declare i32 @g() declare i32 @h(i32 %x) -define i32 @test_global_cfg_break_profitability() { +define i32 @test_global_cfg_break_profitability(i1 %arg) { ; Check that our metrics for the profitability of a CFG break are global rather ; than local. A successor may be very hot, but if the current block isn't, it ; doesn't matter. 
Within this test the 'then' block is slightly warmer than the @@ -530,7 +530,7 @@ define i32 @test_global_cfg_break_profitability() { ; CHECK: ret entry: - br i1 undef, label %then, label %else, !prof !2 + br i1 %arg, label %then, label %else, !prof !2 then: %then.result = call i32 @f() @@ -600,7 +600,7 @@ cleanup: unreachable } -define void @test_unnatural_cfg_backwards_inner_loop() { +define void @test_unnatural_cfg_backwards_inner_loop(i1 %arg) { ; Test that when we encounter an unnatural CFG structure after having formed ; a chain for an inner loop which happened to be laid out backwards we don't ; attempt to merge onto the wrong end of the inner loop just because we find it @@ -612,7 +612,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() { ; CHECK: %loop3 entry: - br i1 undef, label %loop2a, label %body + br i1 %arg, label %loop2a, label %body body: br label %loop2a @@ -692,7 +692,7 @@ exit: ret void } -define void @unanalyzable_branch_to_free_block(float %x) { +define void @unanalyzable_branch_to_free_block(float %x, i1 %arg) { ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the best free block in the CFG. 
; @@ -704,7 +704,7 @@ define void @unanalyzable_branch_to_free_block(float %x) { ; CHECK: %exit entry: - br i1 undef, label %a, label %b + br i1 %arg, label %a, label %b a: call i32 @f() diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll index f6b38839d13cc..e7ffc47527150 100644 --- a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll +++ b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll @@ -173,7 +173,7 @@ define ghccc void @test5() { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: jmp tail@PLT # TAILCALL entry: - br i1 undef, label %then, label %else + br i1 poison, label %then, label %else then: store i64 0, ptr undef @@ -186,4 +186,3 @@ else: exit: ret void } - diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir index aceb344d8b76c..13f3f3ad4187f 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir +++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi.mir @@ -6,7 +6,7 @@ # "Replacement PHI node is already replaced." 
--- | - define void @f1() { + define void @f1(i1 %arg) { entry: %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2 br label %for.cond @@ -30,10 +30,10 @@ %5 = phi ptr [ %4, %for.body ], [ %5, %if.then5 ], [ undef, %for.cond2 ] %6 = phi ptr [ %3, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ] %7 = phi ptr [ %2, %for.body ], [ %6, %if.then5 ], [ undef, %for.cond2 ] - br i1 undef, label %for.cond2, label %if.then5 + br i1 %arg, label %for.cond2, label %if.then5 if.then5: - br i1 undef, label %cleanup, label %for.cond2 + br i1 %arg, label %cleanup, label %for.cond2 cleanup: br i1 true, label %for.cond, label %for.body diff --git a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir index 6159aa8a42e2b..e93e04bfd443e 100644 --- a/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir +++ b/llvm/test/CodeGen/X86/codegen-prepare-replacephi2.mir @@ -7,7 +7,7 @@ --- | - define void @f1() { + define void @f1(i1 %arg) { entry: %arrayidx = getelementptr inbounds [2 x i16], ptr undef, i16 0, i16 2 br label %for.cond @@ -24,7 +24,7 @@ %2 = phi ptr [ %1, %for.cond ], [ %12, %cleanup ] %3 = phi ptr [ %0, %for.cond ], [ %11, %cleanup ] %4 = phi ptr [ %0, %for.cond ], [ %10, %cleanup ] - br i1 undef, label %for.cond2.preheader, label %if.then + br i1 %arg, label %for.cond2.preheader, label %if.then for.cond2.preheader: br label %for.cond2 @@ -37,7 +37,7 @@ %5 = phi ptr [ %8, %for.inc ], [ %4, %for.cond2.preheader ] %6 = phi ptr [ %9, %for.inc ], [ %3, %for.cond2.preheader ] %7 = phi ptr [ %9, %for.inc ], [ %2, %for.cond2.preheader ] - br i1 undef, label %for.inc, label %if.then5 + br i1 %arg, label %for.inc, label %if.then5 if.then5: br i1 true, label %cleanup.loopexit, label %if.end diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll index 230afd1461935..7237b02ca6b66 100644 --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ 
b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -72,7 +72,7 @@ alloca_0: br label %loop.4942 loop.4942: ; preds = %loop.4942, %alloca_0 - br i1 undef, label %loop.4942, label %ifmerge.1298 + br i1 poison, label %loop.4942, label %ifmerge.1298 ifmerge.1298: ; preds = %loop.4942 %gepload4638 = load float, ptr getelementptr inbounds ([49216 x i8], ptr @qa_, i64 0, i64 28324), align 4 diff --git a/llvm/test/CodeGen/X86/crash.ll b/llvm/test/CodeGen/X86/crash.ll index 16e3bb6e50aee..2f49a60a26f4d 100644 --- a/llvm/test/CodeGen/X86/crash.ll +++ b/llvm/test/CodeGen/X86/crash.ll @@ -115,9 +115,9 @@ do.body92: ; preds = %if.then66 ; Crash during XOR optimization. ; -define void @test7() nounwind ssp { +define void @test7(i1 %arg) nounwind ssp { entry: - br i1 undef, label %bb14, label %bb67 + br i1 %arg, label %bb14, label %bb67 bb14: %tmp0 = trunc i16 undef to i1 @@ -157,14 +157,14 @@ entry: ; shift of and. %struct.S0 = type { i8, [2 x i8], i8 } -define void @func_59(i32 %p_63) noreturn nounwind { +define void @func_59(i32 %p_63, i1 %arg) noreturn nounwind { entry: br label %for.body for.body: ; preds = %for.inc44, %entry %p_63.addr.1 = phi i32 [ %p_63, %entry ], [ 0, %for.inc44 ] %l_74.0 = phi i32 [ 0, %entry ], [ %add46, %for.inc44 ] - br i1 undef, label %for.inc44, label %bb.nph81 + br i1 %arg, label %for.inc44, label %bb.nph81 bb.nph81: ; preds = %for.body %tmp98 = add i32 %p_63.addr.1, 0 @@ -237,7 +237,7 @@ declare i64 @llvm.objectsize.i64.p0(ptr, i1) nounwind readnone %t20 = type { i32, i32 } %t21 = type { ptr } -define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE() ssp align 2 { +define void @_ZNK4llvm17MipsFrameLowering12emitPrologueERNS_15MachineFunctionE(i1 %arg) ssp align 2 { bb: %tmp = load ptr, ptr undef, align 4 %tmp3 = getelementptr inbounds %t9, ptr %tmp, i32 0, i32 0, i32 0, i32 0, i32 1 @@ -246,7 +246,7 @@ bb: bb4: ; preds = %bb37, %bb %tmp5 = phi i96 [ undef, %bb ], [ %tmp38, %bb37 ] %tmp6 = phi i96 [ undef, %bb ], [ %tmp39, 
%bb37 ] - br i1 undef, label %bb34, label %bb7 + br i1 %arg, label %bb34, label %bb7 bb7: ; preds = %bb4 %tmp8 = load i32, ptr undef, align 4 @@ -292,7 +292,7 @@ bb33: ; preds = %bb29 unreachable bb34: ; preds = %bb4 - br i1 undef, label %bb36, label %bb35 + br i1 %arg, label %bb36, label %bb35 bb35: ; preds = %bb34 store ptr null, ptr %tmp3, align 4 @@ -319,7 +319,7 @@ declare void @llvm.lifetime.end.p0(i64, ptr nocapture) nounwind ; PR10463 ; Spilling a virtual register with uses. -define void @autogen_239_1000() { +define void @autogen_239_1000(i1 %arg) { BB: %Shuff = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> br label %CF @@ -327,14 +327,14 @@ BB: CF: %B16 = frem <8 x double> zeroinitializer, %Shuff %E19 = extractelement <8 x double> %Shuff, i32 5 - br i1 undef, label %CF, label %CF75 + br i1 %arg, label %CF, label %CF75 CF75: - br i1 undef, label %CF75, label %CF76 + br i1 %arg, label %CF75, label %CF76 CF76: store double %E19, ptr undef - br i1 undef, label %CF76, label %CF77 + br i1 %arg, label %CF76, label %CF77 CF77: %B55 = fmul <8 x double> %B16, undef @@ -396,24 +396,24 @@ if.end: ; InstrEmitter::EmitSubregNode() may steal virtual registers from already ; emitted blocks when isCoalescableExtInstr points out the opportunity. ; Make sure kill flags are cleared on the newly global virtual register. 
-define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream) nounwind uwtable ssp { +define i64 @ov_read(ptr %vf, ptr nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, ptr %bitstream, i1 %arg) nounwind uwtable ssp { entry: - br i1 undef, label %return, label %while.body.preheader + br i1 %arg, label %return, label %while.body.preheader while.body.preheader: ; preds = %entry - br i1 undef, label %if.then3, label %if.end7 + br i1 %arg, label %if.then3, label %if.end7 if.then3: ; preds = %while.body.preheader %0 = load i32, ptr undef, align 4 - br i1 undef, label %land.lhs.true.i255, label %if.end7 + br i1 %arg, label %land.lhs.true.i255, label %if.end7 land.lhs.true.i255: ; preds = %if.then3 - br i1 undef, label %if.then.i256, label %if.end7 + br i1 %arg, label %if.then.i256, label %if.end7 if.then.i256: ; preds = %land.lhs.true.i255 %sub.i = sub i32 0, %0 %conv = sext i32 %sub.i to i64 - br i1 undef, label %if.end7, label %while.end + br i1 %arg, label %if.end7, label %while.end if.end7: ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader unreachable @@ -486,12 +486,12 @@ declare void @fn3(...) ; When coalescing %1 and %2, the IMPLICIT_DEF instruction should be ; erased along with its value number. 
; -define void @rdar12474033() nounwind ssp { +define void @rdar12474033(i1 %arg, i32 %arg2, i32 %arg3, i32 %arg4) nounwind ssp { bb: - br i1 undef, label %bb21, label %bb1 + br i1 %arg, label %bb21, label %bb1 bb1: ; preds = %bb - switch i32 undef, label %bb10 [ + switch i32 %arg2, label %bb10 [ i32 4, label %bb2 i32 1, label %bb9 i32 5, label %bb3 @@ -503,7 +503,7 @@ bb2: ; preds = %bb1 unreachable bb3: ; preds = %bb1, %bb1 - br i1 undef, label %bb4, label %bb5 + br i1 %arg, label %bb4, label %bb5 bb4: ; preds = %bb3 unreachable @@ -521,7 +521,7 @@ bb9: ; preds = %bb1, %bb1 bb10: ; preds = %bb5, %bb1 %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ] %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ] - switch i32 undef, label %bb21 [ + switch i32 %arg3, label %bb21 [ i32 2, label %bb18 i32 3, label %bb13 i32 5, label %bb16 @@ -530,7 +530,7 @@ bb10: ; preds = %bb5, %bb1 ] bb13: ; preds = %bb10 - br i1 undef, label %bb15, label %bb14 + br i1 %arg, label %bb15, label %bb14 bb14: ; preds = %bb13 br label %bb21 @@ -554,7 +554,7 @@ bb21: ; preds = %bb18, %bb14, %bb10, %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ] store <4 x float> %tmp23, ptr undef, align 16 store <4 x float> %tmp22, ptr undef, align 16 - switch i32 undef, label %bb29 [ + switch i32 %arg4, label %bb29 [ i32 5, label %bb27 i32 1, label %bb24 i32 2, label %bb25 diff --git a/llvm/test/CodeGen/X86/domain-reassignment-test.ll b/llvm/test/CodeGen/X86/domain-reassignment-test.ll index af7aca67c8fa4..77c1ef256cf09 100644 --- a/llvm/test/CodeGen/X86/domain-reassignment-test.ll +++ b/llvm/test/CodeGen/X86/domain-reassignment-test.ll @@ -3,7 +3,7 @@ ; Check that the X86 domain reassignment pass doesn't introduce an illegal ; test instruction. 
See PR37396 -define void @japi1_foo2_34617() { +define void @japi1_foo2_34617(i1 %arg) { pass2: br label %if5 @@ -27,7 +27,7 @@ if5: %tmp120 = and i1 %tmp118, %tmp119 %tmp121 = zext i1 %tmp120 to i8 %tmp122 = and i8 %b.055, %tmp121 - br i1 undef, label %L174, label %if5 + br i1 %arg, label %L174, label %if5 L188: unreachable diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll index 8d8d4fa699aae..4a5cddb30e031 100644 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll +++ b/llvm/test/CodeGen/X86/fast-isel-cmp-branch.ll @@ -5,9 +5,9 @@ ; The machine verifier will catch and complain about this case. ; CHECK-LABEL: baz ; CHECK: retq -define void @baz() { +define void @baz(i1 %arg) { entry: - br i1 undef, label %exit, label %exit + br i1 %arg, label %exit, label %exit exit: ret void diff --git a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll index 95432380ced7a..55d9ea90682d6 100644 --- a/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll +++ b/llvm/test/CodeGen/X86/fold-vector-shuffle-crash.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 -define void @autogen_SD13708(i32) { +define void @autogen_SD13708(i32, i1 %arg) { BB: %Shuff7 = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <8 x i32> br label %CF @@ -8,11 +8,11 @@ BB: CF: %Tr = trunc <8 x i64> zeroinitializer to <8 x i32> %Shuff20 = shufflevector <8 x i32> %Shuff7, <8 x i32> %Tr, <8 x i32> - br i1 undef, label %CF, label %CF247 + br i1 %arg, label %CF, label %CF247 CF247: %I171 = insertelement <8 x i32> %Shuff20, i32 %0, i32 0 - br i1 undef, label %CF, label %CF247 + br i1 %arg, label %CF, label %CF247 } define void @autogen_SD13800(ptr, ptr, ptr, i32, i64, i8) { diff --git a/llvm/test/CodeGen/X86/hoist-spill.ll b/llvm/test/CodeGen/X86/hoist-spill.ll index d11b6666442bf..b51609c313b0b 100644 --- a/llvm/test/CodeGen/X86/hoist-spill.ll +++ 
b/llvm/test/CodeGen/X86/hoist-spill.ll @@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu" @d = external global ptr, align 8 ; Function Attrs: norecurse noreturn nounwind uwtable -define void @fn1(i32 %p1, i32 %p2, i64 %p3) { +define void @fn1(i32 %p1, i32 %p2, i64 %p3, i1 %arg) { entry: %tmp = load ptr, ptr @d, align 8 %tmp1 = load ptr, ptr @a, align 8 @@ -54,10 +54,10 @@ for.cond4.preheader: ; preds = %for.body, %for.cond br i1 %cmp528, label %for.inc14, label %for.body6.preheader for.body6.preheader: ; preds = %for.cond4.preheader - br i1 undef, label %for.body6, label %min.iters.checked + br i1 %arg, label %for.body6, label %min.iters.checked min.iters.checked: ; preds = %for.body6.preheader - br i1 undef, label %for.body6, label %vector.memcheck + br i1 %arg, label %for.body6, label %vector.memcheck vector.memcheck: ; preds = %min.iters.checked %bound1 = icmp ule ptr undef, %scevgep41 @@ -85,10 +85,10 @@ vector.body: ; preds = %vector.body, %vecto %tmp16 = getelementptr inbounds i32, ptr %tmp1, i64 %offset.idx.1 store <4 x i32> %wide.load.1, ptr %tmp16, align 4 %index.next.3 = add i64 %index, 32 - br i1 undef, label %middle.block, label %vector.body + br i1 %arg, label %middle.block, label %vector.body middle.block: ; preds = %vector.body, %vector.body.preheader.split - br i1 undef, label %for.inc14, label %for.body6 + br i1 %arg, label %for.inc14, label %for.body6 for.body.preheader: ; preds = %for.cond br label %for.body @@ -98,7 +98,7 @@ for.body: ; preds = %for.body, %for.body %add = add nsw i32 %k.127, 1 %tmp18 = load i32, ptr undef, align 4 store i32 %tmp18, ptr @b, align 4 - br i1 undef, label %for.body, label %for.cond4.preheader + br i1 %arg, label %for.body, label %for.cond4.preheader for.body6: ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, 
%middle.block ] diff --git a/llvm/test/CodeGen/X86/implicit-null-checks.mir b/llvm/test/CodeGen/X86/implicit-null-checks.mir index 0077906b60181..c98019c09a1e6 100644 --- a/llvm/test/CodeGen/X86/implicit-null-checks.mir +++ b/llvm/test/CodeGen/X86/implicit-null-checks.mir @@ -5,15 +5,15 @@ target triple = "x86_64-apple-macosx" ;; Positive test - define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_0(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -24,15 +24,15 @@ ;; Negative test. The regalloc is such that we cannot hoist the ;; instruction materializing 2200000 into $eax - define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr) { + define i32 @imp_null_check_with_bitwise_op_1(ptr %x, i32 %val, ptr %ptr, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 undef not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -43,15 +43,15 @@ ;; Negative test: IR is identical to ;; @imp_null_check_with_bitwise_op_0 but MIR differs. - define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_2(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -62,15 +62,15 @@ ;; Negative test: IR is identical to ;; @imp_null_check_with_bitwise_op_0 but MIR differs. 
- define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_3(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 @@ -80,15 +80,15 @@ } ;; Positive test - define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val) { + define i32 @imp_null_check_with_bitwise_op_4(ptr %x, i32 %val, i1 %arg) { entry: - br i1 undef, label %is_null, label %not_null, !make.implicit !0 + br i1 %arg, label %is_null, label %not_null, !make.implicit !0 is_null: ret i32 42 not_null: - br i1 undef, label %ret_100, label %ret_200 + br i1 %arg, label %ret_100, label %ret_200 ret_100: ret i32 100 diff --git a/llvm/test/CodeGen/X86/interval-update-remat.ll b/llvm/test/CodeGen/X86/interval-update-remat.ll index 44d3db3a29726..91fde2ba018b8 100644 --- a/llvm/test/CodeGen/X86/interval-update-remat.ll +++ b/llvm/test/CodeGen/X86/interval-update-remat.ll @@ -17,13 +17,13 @@ target triple = "i386-unknown-linux-gnu" @f = external global i16, align 2 @.str = external unnamed_addr constant [12 x i8], align 1 -define void @fn1() { +define void @fn1(i1 %arg) { entry: %tmp = load i64, ptr @b, align 8 %or = or i64 0, 3299921317 %and = and i64 %or, %tmp %tmp1 = load i32, ptr @d, align 4 - br i1 undef, label %lor.rhs, label %lor.end + br i1 %arg, label %lor.rhs, label %lor.end lor.rhs: ; preds = %entry %tobool3 = icmp ne i8 undef, 0 @@ -32,7 +32,7 @@ lor.rhs: ; preds = %entry lor.end: ; preds = %lor.rhs, %entry %lor.ext = zext i1 undef to i32 %tmp2 = load i64, ptr @e, align 8 - br i1 undef, label %lor.rhs5, label %lor.end7 + br i1 %arg, label %lor.rhs5, label %lor.end7 lor.rhs5: ; preds = %lor.end br label %lor.end7 diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll index 
9eaa65442a727..6dc0427b02f31 100644 --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -249,16 +249,16 @@ define void @func_o() nounwind uwtable { ; CHECK-NEXT: .LBB12_7: # %if.else.i97 entry: %0 = load i16, ptr undef, align 2 - br i1 undef, label %if.then.i, label %if.end.i + br i1 poison, label %if.then.i, label %if.end.i if.then.i: ; preds = %entry unreachable if.end.i: ; preds = %entry - br i1 undef, label %sw.bb, label %sw.default + br i1 poison, label %sw.bb, label %sw.default sw.bb: ; preds = %if.end.i - br i1 undef, label %if.then44, label %if.end29 + br i1 poison, label %if.then44, label %if.end29 if.end29: ; preds = %sw.bb %1 = urem i16 %0, 10 @@ -267,7 +267,7 @@ if.end29: ; preds = %sw.bb br i1 %cmp25, label %if.then44, label %sw.default sw.default: ; preds = %if.end29, %if.end.i - br i1 undef, label %if.then.i96, label %if.else.i97 + br i1 poison, label %if.then.i96, label %if.else.i97 if.then.i96: ; preds = %sw.default unreachable @@ -277,7 +277,7 @@ if.else.i97: ; preds = %sw.default if.then44: ; preds = %if.end29, %sw.bb %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ] - br i1 undef, label %if.then.i103, label %if.else.i104 + br i1 poison, label %if.then.i103, label %if.else.i104 if.then.i103: ; preds = %if.then44 unreachable @@ -420,4 +420,3 @@ if.end: } !1 = !{!"branch_weights", i32 2, i32 1} - diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll index a00433391f15d..9cd755119e7a5 100644 --- a/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll +++ b/llvm/test/CodeGen/X86/loop-strength-reduce-crash.ll @@ -7,7 +7,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.12.0" -define void @foo() { +define void @foo(i1 %arg) { entry: br label %for @@ -17,7 +17,7 @@ for: store i32 %next, ptr undef, align 4 %add = add i64 %0, 9223372036854775807 %inc = add nsw i32 %next, 1 - br i1 undef, label 
%exit, label %for + br i1 %arg, label %exit, label %for exit: store i64 %add, ptr undef diff --git a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll index 552999fdba65c..cf434419bb978 100644 --- a/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll +++ b/llvm/test/CodeGen/X86/lsr-crash-empty-uses.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32-S128" target triple = "x86_64-unknown-linux-gnu" ; CHECK-LABEL: @hoge -define void @hoge() { +define void @hoge(i1 %arg) { bb: %tmp = sext i32 undef to i64 %tmp3 = sub nsw i64 0, %tmp @@ -21,7 +21,7 @@ bb7: ; preds = %bb7, %bb4 br i1 true, label %bb11, label %bb7 bb11: ; preds = %bb7 - br i1 undef, label %bb20, label %bb12 + br i1 %arg, label %bb20, label %bb12 bb12: ; preds = %bb11 br label %bb13 diff --git a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll index efa9331cfcc40..a35015d09a4fc 100644 --- a/llvm/test/CodeGen/X86/lsr-delayed-fold.ll +++ b/llvm/test/CodeGen/X86/lsr-delayed-fold.ll @@ -30,7 +30,7 @@ bb24: ; preds = %bb21, %bb11 ; ScalarEvolution should be able to correctly expand the crazy addrec here. 
; PR6914 -define void @int323() nounwind { +define void @int323(i1 %arg) nounwind { entry: br label %for.cond @@ -38,7 +38,7 @@ for.cond: ; preds = %lbl_264, %for.inc, %g_263.tmp.1 = phi i8 [ undef, %entry ], [ %g_263.tmp.1, %for.cond ] %p_95.addr.0 = phi i8 [ 0, %entry ], [ %add, %for.cond ] %add = add i8 %p_95.addr.0, 1 ; [#uses=1] - br i1 undef, label %for.cond, label %lbl_264 + br i1 %arg, label %for.cond, label %lbl_264 lbl_264: ; preds = %if.end, %lbl_264.preheader %g_263.tmp.0 = phi i8 [ %g_263.tmp.1, %for.cond ] ; [#uses=1] @@ -56,13 +56,13 @@ lbl_264: ; preds = %if.end, %lbl_264.pr %struct.Bu = type { i32, i32, i32 } -define void @_Z3fooP2Bui(ptr nocapture %bu) { +define void @_Z3fooP2Bui(ptr nocapture %bu, i1 %arg) { entry: br label %for.body for.body: ; preds = %for.inc131, %entry %indvar = phi i64 [ %indvar.next, %for.inc131 ], [ 0, %entry ] ; [#uses=3] - br i1 undef, label %for.inc131, label %lor.lhs.false + br i1 %arg, label %for.inc131, label %lor.lhs.false lor.lhs.false: ; preds = %for.body %tmp15 = add i64 %indvar, 1 ; [#uses=1] @@ -123,11 +123,11 @@ for.body123: ; preds = %for.body123, %lor.l %add129 = add i32 %mul, %j.03 ; [#uses=1] tail call void undef(i32 %add129) %inc = add nsw i32 %j.03, 1 ; [#uses=1] - br i1 undef, label %for.inc131, label %for.body123 + br i1 %arg, label %for.inc131, label %for.body123 for.inc131: ; preds = %for.body123, %for.body %indvar.next = add i64 %indvar, 1 ; [#uses=1] - br i1 undef, label %for.end134, label %for.body + br i1 %arg, label %for.end134, label %for.body for.end134: ; preds = %for.inc131 ret void @@ -138,14 +138,14 @@ for.end134: ; preds = %for.inc131 ; require insert point adjustment. 
; PR7306 -define fastcc i32 @GetOptimum() nounwind { +define fastcc i32 @GetOptimum(i1 %arg) nounwind { bb: br label %bb1 bb1: ; preds = %bb1, %bb %t = phi i32 [ 0, %bb ], [ %t2, %bb1 ] ; [#uses=1] %t2 = add i32 %t, undef ; [#uses=3] - br i1 undef, label %bb1, label %bb3 + br i1 %arg, label %bb1, label %bb3 bb3: ; preds = %bb1 %t4 = add i32 undef, -1 ; [#uses=1] @@ -155,13 +155,13 @@ bb5: ; preds = %bb16, %bb3 %t6 = phi i32 [ %t17, %bb16 ], [ 0, %bb3 ] ; [#uses=3] %t7 = add i32 undef, %t6 ; [#uses=2] %t8 = add i32 %t4, %t6 ; [#uses=1] - br i1 undef, label %bb9, label %bb10 + br i1 %arg, label %bb9, label %bb10 bb9: ; preds = %bb5 br label %bb10 bb10: ; preds = %bb9, %bb5 - br i1 undef, label %bb11, label %bb16 + br i1 %arg, label %bb11, label %bb16 bb11: ; preds = %bb10 %t12 = icmp ugt i32 %t7, %t2 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll index 5828f06bf1c39..41eae3ca03c2b 100644 --- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll +++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -52,7 +52,7 @@ define void @PR24199(i32 %a0) { entry: %i = alloca %struct.A, align 8 %tobool = icmp ne i32 %a0, 0 - br i1 undef, label %if.end, label %if.then + br i1 poison, label %if.end, label %if.then if.then: br label %if.end @@ -96,5 +96,3 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) !4 = !DIExpression() !5 = !DILocalVariable(name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer) !6 = !DILocation(line: 0, scope: !3) - - diff --git a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll index a10fbc10bf284..3dba5eb15d67f 100644 --- a/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll +++ b/llvm/test/CodeGen/X86/merge-vector-stores-scale-idx-crash.ll @@ -21,10 +21,10 @@ define void @testfn(ptr nocapture %p) { ; CHECK-LABEL: testfn_scalar ; CHECK: retq 
-define void @testfn_scalar(ptr nocapture %j) local_unnamed_addr #0 align 2 { +define void @testfn_scalar(ptr nocapture %j, i1 %arg) local_unnamed_addr #0 align 2 { entry: %0 = bitcast i64 undef to <2 x float> - br i1 undef, label %if.end, label %if.then + br i1 %arg, label %if.end, label %if.then if.then: ; preds = %entry unreachable diff --git a/llvm/test/CodeGen/X86/misched-crash.ll b/llvm/test/CodeGen/X86/misched-crash.ll index 98818d9a102fe..a421faba95f7e 100644 --- a/llvm/test/CodeGen/X86/misched-crash.ll +++ b/llvm/test/CodeGen/X86/misched-crash.ll @@ -4,7 +4,7 @@ target triple = "x86_64-apple-macosx10" ; This function contains a cmp instruction with two users. ; Hoisting the last use requires trimming the EFLAGS live range to the second. -define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2) { +define void @rdar13353090(ptr %plane, i64 %_x1, i64 %_x2, i1 %arg) { entry: %cmp = icmp ult i64 %_x1, %_x2 %cond = select i1 %cmp, i64 %_x1, i64 %_x2 @@ -33,7 +33,7 @@ for.body34.i: ; preds = %for.inc39.i, %if.th for.inc39.i: ; preds = %for.body34.i %inc41.i = add i64 %index.178.i, 1 - br i1 undef, label %return, label %for.body34.i + br i1 %arg, label %return, label %for.body34.i return: ; preds = %for.inc39.i, %for.body34.i, %land.lhs.true21, %entry ret void diff --git a/llvm/test/CodeGen/X86/pr10475.ll b/llvm/test/CodeGen/X86/pr10475.ll index 4dd5aab499ca8..4275dc262c378 100644 --- a/llvm/test/CodeGen/X86/pr10475.ll +++ b/llvm/test/CodeGen/X86/pr10475.ll @@ -2,19 +2,19 @@ ; No check in a crash test -define void @autogen_262380_1000() { +define void @autogen_262380_1000(i1 %arg) { BB: br label %CF79 CF79: ; preds = %CF79, %BB - br i1 undef, label %CF79, label %CF84.critedge.critedge + br i1 %arg, label %CF79, label %CF84.critedge.critedge CF84.critedge.critedge: ; preds = %CF79 %L35 = load <8 x i32>, ptr undef br label %CF85 CF85: ; preds = %CF85, %CF84.critedge.critedge - br i1 undef, label %CF85, label %CF86 + br i1 %arg, label %CF85, label %CF86 CF86: ; 
preds = %CF86, %CF85 %B61 = sub <8 x i32> %L35, zeroinitializer @@ -23,7 +23,7 @@ CF86: ; preds = %CF86, %CF85 br i1 %E73, label %CF86, label %CF87 CF87: ; preds = %CF87, %CF86 - br i1 undef, label %CF87, label %CF88 + br i1 %arg, label %CF87, label %CF88 CF88: ; preds = %CF87 ret void diff --git a/llvm/test/CodeGen/X86/pr11998.ll b/llvm/test/CodeGen/X86/pr11998.ll index caaf2710fba8c..4b93c20e7c236 100644 --- a/llvm/test/CodeGen/X86/pr11998.ll +++ b/llvm/test/CodeGen/X86/pr11998.ll @@ -1,13 +1,13 @@ ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-- -mattr=+avx -define void @autogen_51367_5000(i8) { +define void @autogen_51367_5000(i8, i1 %arg) { BB: %B = srem i8 55, %0 %B9 = shl i8 %B, %B br label %CF CF: ; preds = %CF, %BB - br i1 undef, label %CF, label %CF403 + br i1 %arg, label %CF, label %CF403 CF403: ; preds = %CF403, %CF %S44 = icmp eq i8 %B9, %0 diff --git a/llvm/test/CodeGen/X86/pr32108.ll b/llvm/test/CodeGen/X86/pr32108.ll index 32f8a7657a3f1..a50b9a676ae2e 100644 --- a/llvm/test/CodeGen/X86/pr32108.ll +++ b/llvm/test/CodeGen/X86/pr32108.ll @@ -13,7 +13,7 @@ BB: br label %CF243 CF243: ; preds = %CF243, %BB - br i1 undef, label %CF243, label %CF257 + br i1 poison, label %CF243, label %CF257 CF257: ; preds = %CF243 %Shuff144 = shufflevector <4 x i1> undef, <4 x i1> %Cmp45, <4 x i32> diff --git a/llvm/test/CodeGen/X86/pr50254.ll b/llvm/test/CodeGen/X86/pr50254.ll index 01d261a3fd4b6..95b7ae5e3e025 100644 --- a/llvm/test/CodeGen/X86/pr50254.ll +++ b/llvm/test/CodeGen/X86/pr50254.ll @@ -37,7 +37,7 @@ entry: br label %for.body for.body: ; preds = %entry - br i1 undef, label %for.end, label %for.body.1 + br i1 poison, label %for.end, label %for.body.1 for.end: ; preds = %for.body store i16 %xor1, ptr @d.e, align 4 diff --git a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll index 4ca8ae91f9e6f..779978b90349e 100644 --- a/llvm/test/CodeGen/X86/pr57673.ll +++ b/llvm/test/CodeGen/X86/pr57673.ll @@ -100,7 +100,7 @@ bb_entry: br label %bb_8 
bb_8: ; preds = %bb_last, %bb_entry - br i1 undef, label %bb_last, label %bb_mid + br i1 poison, label %bb_last, label %bb_mid bb_mid: ; preds = %bb_8 %i4 = getelementptr inbounds %t10, ptr %i1, i64 0, i32 1, i64 32 diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll index beb42f55b709c..47e5079e9c363 100644 --- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -331,13 +331,13 @@ if.end: ] if.then4: - br i1 undef, label %SyTime.exit, label %if.then.i + br i1 poison, label %SyTime.exit, label %if.then.i if.then.i: unreachable SyTime.exit: - br i1 undef, label %SyTime.exit2681, label %if.then.i2673 + br i1 poison, label %SyTime.exit2681, label %if.then.i2673 if.then.i2673: unreachable @@ -349,7 +349,7 @@ land.lhs.true14: unreachable if.end25: - br i1 undef, label %SyTime.exit2720, label %if.then.i2712 + br i1 poison, label %SyTime.exit2720, label %if.then.i2712 if.then.i2712: unreachable @@ -406,7 +406,7 @@ do.end: %mul167 = shl i32 %rep.6, 2 %rep.8 = select i1 %cmp164, i32 %mul167, i32 %rep.6 %..ch.19 = select i1 false, i32 2, i32 0 - br i1 undef, label %while.body200, label %while.end1465 + br i1 poison, label %while.body200, label %while.end1465 while.body200: %dec3386.in = phi i32 [ %dec3386, %while.cond197.backedge ], [ %rep.8, %do.end ] @@ -444,7 +444,7 @@ while.cond1037.preheader: br i1 %cmp10393273, label %if.end1070, label %land.rhs1041 while.cond635.preheader: - br i1 undef, label %for.body643.us, label %while.cond661 + br i1 poison, label %for.body643.us, label %while.cond661 for.body643.us: br label %for.body643.us @@ -488,7 +488,7 @@ land.rhs485: br i1 %isascii.i.i27763151, label %cond.true.i.i2780, label %cond.false.i.i2782 cond.true.i.i2780: - br i1 undef, label %land.lhs.true490, label %lor.rhs500 + br i1 poison, label %land.lhs.true490, label %lor.rhs500 cond.false.i.i2782: unreachable @@ -499,10 +499,10 @@ land.lhs.true490: lor.rhs500: ; Make 
sure spill is hoisted to a cold preheader in outside loop. %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256) - br i1 undef, label %land.lhs.true504, label %do.body479.backedge + br i1 poison, label %land.lhs.true504, label %do.body479.backedge land.lhs.true504: - br i1 undef, label %do.body479.backedge, label %if.end517 + br i1 poison, label %do.body479.backedge, label %if.end517 do.body479.backedge: %incdec.ptr480 = getelementptr i8, ptr %incdec.ptr4803316, i64 1 @@ -531,10 +531,10 @@ for.cond534: br i1 %cmp536, label %for.cond542.preheader, label %for.cond534 for.cond542.preheader: - br i1 undef, label %for.body545, label %for.end552 + br i1 poison, label %for.body545, label %for.end552 for.body545: - br i1 undef, label %for.end552, label %for.body545 + br i1 poison, label %for.end552, label %for.body545 for.end552: %s.2.lcssa = phi ptr [ undef, %for.cond542.preheader ], [ %q.4, %for.body545 ] @@ -554,7 +554,7 @@ while.cond864: br label %while.cond864 sw.bb956: - br i1 undef, label %if.then959, label %while.cond197.backedge + br i1 poison, label %if.then959, label %while.cond197.backedge if.then959: br label %while.cond962 @@ -600,7 +600,7 @@ while.end1465: ] for.cond1480.preheader: - br i1 undef, label %for.body1606.lr.ph, label %for.end1609 + br i1 poison, label %for.body1606.lr.ph, label %for.end1609 if.then1477: %p.1.lcssa3539 = phi ptr [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ %line, %while.body200 ] @@ -614,7 +614,7 @@ for.body1606.lr.ph: br label %for.end1609 for.end1609: - br i1 undef, label %for.cond1659.preheader, label %land.lhs.true1614 + br i1 poison, label %for.cond1659.preheader, label %land.lhs.true1614 land.lhs.true1614: br label %for.cond1659.preheader @@ -631,13 +631,13 @@ while.body1703.lr.ph: unreachable while.cond1683.preheader: - br i1 undef, label %while.body1691, label %while.end1693 + br i1 poison, label %while.body1691, label %while.end1693 while.body1679: 
%oldc.43406 = phi i32 [ %inc, %syEchoch.exit3070 ], [ %oldc.1.lcssa, %for.body1664.lr.ph ] %3 = load ptr, ptr %echo.i3101, align 8, !tbaa !6 %call.i3062 = call i32 @fileno(ptr %3) - br i1 undef, label %if.then.i3069, label %syEchoch.exit3070 + br i1 poison, label %if.then.i3069, label %syEchoch.exit3070 if.then.i3069: br label %syEchoch.exit3070 diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index c9edd3f3e9048..cd3d481107723 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -408,7 +408,7 @@ define dso_local void @PR42880(i32 %t0) { %x = ptrtoint ptr %add.ptr.i94 to i32 %sub2 = sub i32 %x, 0 %div = sdiv exact i32 %sub2, 24 - br i1 undef, label %if, label %then + br i1 poison, label %if, label %then then: %t1 = xor i32 %div, -1 diff --git a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll index e10e3dd1cd925..962b833ad9a1d 100644 --- a/llvm/test/CodeGen/X86/shuffle-combine-crash.ll +++ b/llvm/test/CodeGen/X86/shuffle-combine-crash.ll @@ -28,7 +28,7 @@ define void @sample_test() { ; CHECK-NEXT: movd %xmm0, (%rax) ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: retq - br i1 undef, label %5, label %1 + br i1 poison, label %5, label %1 ;